먼저 입문자를 위한 간단한 선형 회귀 코드부터 시작합니다. 코드는 다음과 같습니다:
from sklearn import linear_model as lm
import numpy as np
import os
import pandas as pd
def read_data(path):
    """Load the CSV file at *path* into a pandas DataFrame.

    :param path: filesystem path of the CSV data file
    :return: DataFrame holding the file's rows, with the header row
             providing the column names
    """
    frame = pd.read_csv(path)
    return frame
def train_model(train_data, features, labels):
    """Fit a linear regression model on the given training data.

    The fitted intercept and coefficients are printed for inspection.

    :param train_data: DataFrame containing both feature and label columns
    :param features: list of feature column names
    :param labels: list of label column names
    :return: the fitted LinearRegression model
    """
    regressor = lm.LinearRegression()
    X = train_data[features]
    y = train_data[labels]
    regressor.fit(X, y)
    print(regressor.intercept_)
    print(regressor.coef_)
    return regressor
def linear_model(data, data_number):
    """Split *data* into train/test parts and fit a linear regression.

    The first ``data_number`` rows form the training set; the remaining
    rows are held out as a test set (currently unused — kept for a later
    evaluation step).

    :param data: DataFrame whose columns match the CSV header ("x", "y")
    :param data_number: number of leading rows used as the training set
    :return: the fitted model (the original version discarded it, so
             callers could never use the trained model)
    """
    # Feature column names; must match the header row of the data file.
    features = ["x"]
    # Label column names; must match the header row of the data file.
    labels = ["y"]
    # Split at data_number: rows 0..data_number-1 train, the rest test.
    train_data = data[:data_number]
    test_data = data[data_number:]  # reserved for evaluation
    # Fit the model on the training portion and hand it back.
    model = train_model(train_data, features, labels)
    return model
if __name__ == "__main__":
    # Directory containing this script; the data file lives beside it.
    home_path = os.path.dirname(os.path.abspath(__file__))
    # os.path.join picks the correct separator on every platform, so the
    # manual Windows ("nt") vs. Linux branching is unnecessary.
    data_path = os.path.join(home_path, "data", "simple_example.csv")
    data = read_data(data_path)
    linear_model(data, data_number=15)
읽어 들이는 simple_example.csv 데이터 파일의 내용은 다음과 같습니다:
x,y
10,7.7
10,9.87
11,11.18
12,10.43
13,12.36
14,14.15
15,15.73
16,16.4
17,18.86
18,16.13
19,18.21
20,18.37
21,22.61
22,19.83
이 프로그램의 기본 아이디어는 다음과 같습니다:
(1) pandas로 CSV 형식의 데이터를 읽고, 파라미터 data_number(여기서는 15)로 앞부분을 훈련 데이터로 지정합니다;
(2) 데이터를 두 부분으로 나누어, 첫 번째 부분(훈련 데이터)으로 선형 회귀 모델을 학습시키고, 두 번째 부분은 테스트용으로 남겨 둡니다. 핵심 코드는 다음과 같습니다:
model = lm.LinearRegression()
model.fit(train_data[features], train_data[labels])
학습(fitting) 결과는 다음과 같습니다:
y = 1.01211289x - 0.62794705
fit 메서드를 따라가 보면, 이 호출의 핵심은 다음과 같습니다:
linalg.lstsq(X, y)
이는 scipy 패키지가 최소제곱법으로 Ax = b를 푸는 함수이며, scipy 소스의 scipy/linalg/basic.py에 있는 전체 소스 코드는 다음과 같습니다:
# Linear Least Squares
def lstsq(a, b, cond=None, overwrite_a=False, overwrite_b=False,
          check_finite=True, lapack_driver=None):
    """
    Compute a least-squares solution x to a @ x = b using LAPACK.

    (The full scipy docstring was omitted in this excerpt.)
    """
    # Coerce inputs to validated ndarrays (optionally rejecting NaN/Inf).
    a1 = _asarray_validated(a, check_finite=check_finite)
    b1 = _asarray_validated(b, check_finite=check_finite)
    if len(a1.shape) != 2:
        raise ValueError('Input array a should be 2-D')
    m, n = a1.shape
    # nrhs = number of right-hand sides (columns of b).
    if len(b1.shape) == 2:
        nrhs = b1.shape[1]
    else:
        nrhs = 1
    if m != b1.shape[0]:
        raise ValueError('Shape mismatch: a and b should have the same number'
                         ' of rows ({} != {}).'.format(m, b1.shape[0]))
    if m == 0 or n == 0:  # Zero-sized problem, confuses LAPACK
        x = np.zeros((n,) + b1.shape[1:], dtype=np.common_type(a1, b1))
        if n == 0:
            residues = np.linalg.norm(b1, axis=0)**2
        else:
            residues = np.empty((0,))
        return x, residues, 0, np.empty((0,))

    # Select the LAPACK driver; falls back to the module-level default
    # ('gelsd', set below on the function object).
    driver = lapack_driver
    if driver is None:
        driver = lstsq.default_lapack_driver
    if driver not in ('gelsd', 'gelsy', 'gelss'):
        raise ValueError('LAPACK driver "%s" is not found' % driver)

    # Fetch the dtype-matched LAPACK routine and its lwork query routine.
    lapack_func, lapack_lwork = get_lapack_funcs((driver,
                                                  '%s_lwork' % driver),
                                                 (a1, b1))
    real_data = True if (lapack_func.dtype.kind == 'f') else False

    if m < n:
        # need to extend b matrix as it will be filled with
        # a larger solution matrix
        if len(b1.shape) == 2:
            b2 = np.zeros((n, nrhs), dtype=lapack_func.dtype)
            b2[:m, :] = b1
        else:
            b2 = np.zeros(n, dtype=lapack_func.dtype)
            b2[:m] = b1
        b1 = b2

    overwrite_a = overwrite_a or _datacopied(a1, a)
    overwrite_b = overwrite_b or _datacopied(b1, b)

    # Default rank-determination cutoff: machine epsilon of the dtype.
    if cond is None:
        cond = np.finfo(lapack_func.dtype).eps

    if driver in ('gelss', 'gelsd'):
        if driver == 'gelss':
            lwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
            v, x, s, rank, work, info = lapack_func(a1, b1, cond, lwork,
                                                    overwrite_a=overwrite_a,
                                                    overwrite_b=overwrite_b)
        elif driver == 'gelsd':
            if real_data:
                # Real path: gelsd also needs an integer work array.
                lwork, iwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
                x, s, rank, info = lapack_func(a1, b1, lwork,
                                               iwork, cond, False, False)
            else:  # complex data
                lwork, rwork, iwork = _compute_lwork(lapack_lwork, m, n,
                                                     nrhs, cond)
                x, s, rank, info = lapack_func(a1, b1, lwork, rwork, iwork,
                                               cond, False, False)
        # info > 0: SVD failed to converge; info < 0: bad argument value.
        if info > 0:
            raise LinAlgError("SVD did not converge in Linear Least Squares")
        if info < 0:
            raise ValueError('illegal value in %d-th argument of internal %s'
                             % (-info, lapack_driver))
        resids = np.asarray([], dtype=x.dtype)
        if m > n:
            # Overdetermined case: first n rows hold the solution; the
            # remaining rows carry residual information when full rank.
            x1 = x[:n]
            if rank == n:
                resids = np.sum(np.abs(x[n:])**2, axis=0)
            x = x1
        return x, resids, rank, s

    elif driver == 'gelsy':
        lwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
        jptv = np.zeros((a1.shape[1], 1), dtype=np.int32)
        v, x, j, rank, info = lapack_func(a1, b1, jptv, cond,
                                          lwork, False, False)
        if info < 0:
            raise ValueError("illegal value in %d-th argument of internal "
                             "gelsy" % -info)
        if m > n:
            x1 = x[:n]
            x = x1
        # gelsy does not compute singular values: return None in their place.
        return x, np.array([], x.dtype), rank, None

lstsq.default_lapack_driver = 'gelsd'
lstsq는 'gelsd', 'gelsy', 'gelss' 세 가지 해법(LAPACK driver)을 제공합니다. 위 예제에서는 driver 파라미터를 전달하지 않았으므로 기본값인 'gelsd'가 사용됩니다. 따라서 다음 코드에 집중하면 됩니다:
elif driver == 'gelsd':
if real_data:
lwork, iwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
x, s, rank, info = lapack_func(a1, b1, lwork,
iwork, cond, False, False)
else: # complex data
lwork, rwork, iwork = _compute_lwork(lapack_lwork, m, n,
nrhs, cond)
x, s, rank, info = lapack_func(a1, b1, lwork, rwork, iwork,
cond, False, False)
우리의 데이터는 실수(real)이므로 real_data 분기에서 _compute_lwork와 lapack_func가 호출됩니다. _compute_lwork는 basic.py와 같은 디렉터리의 lapack.py에 있으며, 소스 코드는 다음과 같습니다:
def _compute_lwork(routine, *args, **kwargs):
    """
    Round the floating-point lwork returned by LAPACK to an integer.

    Several LAPACK routines report their optimal workspace size LWORK in
    a floating-point variable.  For large workspaces, single-precision
    floating point cannot hold the exact value --- some LAPACK versions
    (<= 3.5.0 at least) truncate the returned integer to single
    precision, which can undershoot the real requirement.

    Examples
    --------
    >>> from scipy.linalg import lapack
    >>> n = 5000
    >>> s_r, s_lw = lapack.get_lapack_funcs(('sysvx', 'sysvx_lwork'))
    >>> lwork = lapack._compute_lwork(s_lw, n)
    >>> lwork
    32000
    """
    # The query routine returns one or more workspace sizes plus a
    # trailing status code.
    result = routine(*args, **kwargs)
    if len(result) < 2:
        raise ValueError('')
    status = result[-1]
    if status != 0:
        raise ValueError("Internal work array size computation failed: "
                         "%d" % (status,))
    # Keep only the real parts of the reported sizes.
    sizes = [value.real for value in result[:-1]]
    routine_dtype = getattr(routine, 'dtype', None)
    if routine_dtype == _np.float32 or routine_dtype == _np.complex64:
        # Single-precision routine -- bump to the next representable
        # float to compensate for possible truncation inside LAPACK.
        sizes = _np.nextafter(sizes, _np.inf, dtype=_np.float32)
    sizes = _np.array(sizes, _np.int64)
    if _np.any(_np.logical_or(sizes < 0,
                              sizes > _np.iinfo(_np.int32).max)):
        raise ValueError("Too large work array required -- computation cannot "
                         "be performed with standard 32-bit LAPACK.")
    sizes = sizes.astype(_np.int32)
    # A single size is returned as a scalar, multiple sizes as an array.
    if sizes.size == 1:
        return sizes[0]
    return sizes
다음으로 lapack_func는 get_lapack_funcs에서 얻어지며, 이 함수 역시 lapack.py에 있습니다. 소스 코드는 다음과 같습니다:
def get_lapack_funcs(names, arrays=(), dtype=None):
    """
    Return the LAPACK routines matching *names* for the given arrays.

    In LAPACK, the naming convention is that every function starts with
    a type prefix determined by the type of the principal matrix: one of
    {'s', 'd', 'c', 'z'} for the numpy types {float32, float64,
    complex64, complex128} respectively.  The chosen prefix is stored in
    the ``typecode`` attribute of each returned function.
    """
    # Delegate the actual lookup to the shared _get_funcs helper.
    return _get_funcs(names, arrays, dtype, "LAPACK",
                      _flapack, _clapack,
                      "flapack", "clapack", _lapack_alias)
실제 함수 조회는 주석에서 언급된 _get_funcs가 수행합니다. LAPACK 명명 규칙에 따라 모든 함수 이름은 주 행렬의 타입으로 결정되는 접두사 's', 'd', 'c', 'z' 중 하나로 시작하며, 우리의 데이터는 float64이므로 'd'에 해당합니다. 따라서 최종적으로 호출되는 함수는 다음과 같습니다:
scipy.linalg.lapack.dgelsd
도표 :
실제로 호출되는 것은 Fortran 객체입니다. 이 함수는 같은 디렉터리의 flapack_gen.pyf.src에 f2py 시그니처 형태로 정의되어 있으며, 소스 코드는 다음과 같습니다:
! f2py signature for the LAPACK ?gelsd driver: least-squares solve via SVD.
! <prefix2> expands to the type prefix ('d' for float64 data).
subroutine <prefix2>gelsd(m,n,minmn,maxmn,nrhs,a,b,s,cond,r,work,lwork,size_iwork,iwork,info)
! x,s,rank,info = dgelsd(a,b,lwork,size_iwork,cond=-1.0,overwrite_a=True,overwrite_b=True)
! Solve Minimize 2-norm(A * X - B).
callstatement (*f2py_func)(&m,&n,&nrhs,a,&m,b,&maxmn,s,&cond,&r,work,&lwork,iwork,&info)
callprotoargument int*,int*,int*,<ctype2>*,int*,<ctype2>*,int*,<ctype2>*,<ctype2>*,int*,<ctype2>*,int*,int*,int*
! Dimensions m, n, minmn, maxmn are derived from the array shapes and
! hidden from the Python-level call signature.
integer intent(hide),depend(a):: m = shape(a,0)
integer intent(hide),depend(a):: n = shape(a,1)
integer intent(hide),depend(m,n):: minmn = MIN(m,n)
integer intent(hide),depend(m,n):: maxmn = MAX(m,n)
<ftype2> dimension(m,n),intent(in,copy) :: a
integer depend(b),intent(hide):: nrhs = shape(b,1)
<ftype2> dimension(maxmn,nrhs),check(maxmn==shape(b,0)),depend(maxmn) :: b
! b is overwritten with the solution and returned to Python as "x".
intent(in,out,copy,out=x) b
<ftype2> intent(in),optional :: cond=-1.0
! r is the effective rank, exposed to Python as "rank".
integer intent(out,out=rank) :: r
<ftype2> intent(out),dimension(minmn),depend(minmn) :: s
integer intent(in),check(lwork>=1) :: lwork
! Impossible to calculate lwork explicitly, need to obtain it from query call first
! Same for size_iwork
<ftype2> dimension(lwork),intent(cache,hide),depend(lwork) :: work
integer intent(in) :: size_iwork
integer intent(cache,hide),dimension(MAX(1,size_iwork)),depend(size_iwork) :: iwork
integer intent(out)::info
end subroutine <prefix2>gelsd