import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# Load the data file first.
salary = pd.read_csv('./day9_data/adults.txt')
salary.head()
# Columns of interest: work class, education level, years of education,
# occupation, hours worked per week.
# workclass education education_num occupation hours_per_week
age | workclass | final_weight | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | salary | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
salary.dtypes
- 输出 age int64 workclass object final_weight int64 education object education_num int64 marital_status object occupation object relationship object race object sex object capital_gain int64 capital_loss int64 hours_per_week int64 native_country object salary object dtype: object
salary.shape
- 输出 (32561, 15)
salary.columns
- 输出 Index([‘age’, ‘workclass’, ‘final_weight’, ‘education’, ‘education_num’, ‘marital_status’, ‘occupation’, ‘relationship’, ‘race’, ‘sex’, ‘capital_gain’, ‘capital_loss’, ‘hours_per_week’, ‘native_country’, ‘salary’], dtype=’object’)
# Prediction target: the salary column.
target = salary['salary']
# Feature columns used for training. An explicit copy is taken because the
# string columns are overwritten with integer codes further down; assigning
# into a bare slice of `salary` raises pandas' SettingWithCopyWarning and is
# not guaranteed to write where intended.
data = salary[['age', 'workclass', 'education', 'education_num', 'occupation', 'sex', 'hours_per_week', 'native_country', 'race']].copy()
# Features and target are now available.
# Some columns hold strings, which the estimator cannot consume directly,
# so they are mapped to integer codes.
knn = KNeighborsClassifier(n_neighbors=15)
data.dtypes
- 输出 age int64 workclass int64 education int64 education_num int64 occupation int64 sex int64 hours_per_week int64 native_country int64 race int64 dtype: object
# Replace every string-valued column with integer codes.
#
# pd.factorize numbers the distinct values in order of first appearance (the
# same order Series.unique() yields), so `codes + 1` reproduces the previous
# "index in unique() plus one" mapping. It does so in a single vectorised pass
# instead of running an np.argwhere search for every row through a helper that
# was redefined on each loop iteration.
# Missing values, which made the old lookup raise IndexError, get code 0.
cols = ['education', 'occupation', 'sex', 'native_country', 'race']
for col in ['workclass'] + cols:
    # `unique_` holds the distinct values of the column in code order.
    codes, unique_ = pd.factorize(data[col])
    # Shift by one so that real categories never map to 0.
    data[col] = codes + 1
data.head()
age | workclass | education | education_num | occupation | sex | hours_per_week | native_country | race | |
---|---|---|---|---|---|---|---|---|---|
0 | 39 | 1 | 1 | 13 | 1 | 1 | 40 | 1 | 1 |
1 | 50 | 2 | 1 | 13 | 2 | 1 | 13 | 1 | 1 |
2 | 38 | 3 | 2 | 9 | 3 | 1 | 40 | 1 | 1 |
3 | 53 | 3 | 3 | 7 | 3 | 1 | 40 | 1 | 2 |
4 | 28 | 3 | 1 | 13 | 4 | 2 | 40 | 2 | 2 |
# Split the data; test_size=0.005 holds out 0.5% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.005)
# Instantiate the classifier.
knn = KNeighborsClassifier(n_neighbors=15)
# Train on the training split.
knn.fit(X_train, y_train)
- 输出 KNeighborsClassifier(algorithm=’auto’, leaf_size=30, metric=’minkowski’, metric_params=None, n_jobs=1, n_neighbors=15, p=2, weights=’uniform’)
# Predict labels for the held-out rows.
y_ = knn.predict(X_test)
# Accuracy on the held-out rows.
knn.score(X_test,y_test)
- 输出 0.803680981595092
# 保存训练的模型 脸部识别的算法是被打包的 数学建模
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23; joblib is imported as a standalone package instead.
import joblib
# Persist the trained model to disk; the ".m" suffix is the naming convention
# used for saved models here.
joblib.dump(knn, './50K.m')
- 输出 [‘./50K.m’]
# Whatever can be dumped can also be loaded back.
# CV2
# joblib provides the matching loader for the dumped model.
# NOTE(review): joblib.load unpickles the file, so only load model files that
# come from a trusted source.
knn_50k = joblib.load('./50K.m')
knn_50k.score(X_test,y_test)
- 输出 0.803680981595092
# 使用算法,识别是否是乳腺癌
分布式存储的原理
分布式:基于一主多从,多台从服务器监听主服务器,主服务器开放3306端口,开放对从服务器IP地址的支持,
都是基于binlog = 二进制传输流
主服务器 都是innodb 负责插入 (.frm 表的结构,表的索引)(.ibd储存的是数据)master
从服务器slave 要设置主服务器的ip、主服务器的端口,以及一个主服务器的mysql账号
还需要主服务器的二进制日志(binlog)文件名和日志位置(position)
新式类 经典类
就是一台主机让多台从机允许监听,然后从服务器不停拷贝主服务器中的数据
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# Load the data.
# read_csv's default separator is a comma (most CSV files are comma-separated);
# this file is read with a tab separator instead.
canser = pd.read_csv('./day9_data/cancer.csv',sep='\t')
canser.head()
# Diagnosis column: M = malignant, B = benign.
ID | Diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave_mean | … | radius_max | texture_max | perimeter_max | area_max | smoothness_max | compactness_max | concavity_max | concave_max | symmetry_max | fractal_max | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | … | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | … | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | … | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | … | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | … | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
5 rows × 32 columns
canser.shape
- 输出 (569, 32)
canser.dtypes
- 输出 ID int64 Diagnosis object radius_mean float64 texture_mean float64 perimeter_mean float64 area_mean float64 smoothness_mean float64 compactness_mean float64 concavity_mean float64 concave_mean float64 symmetry_mean float64 fractal_mean float64 radius_sd float64 texture_sd float64 perimeter_sd float64 area_sd float64 smoothness_sd float64 compactness_sd float64 concavity_sd float64 concave_sd float64 symmetry_sd float64 fractal_sd float64 radius_max float64 texture_max float64 perimeter_max float64 area_max float64 smoothness_max float64 compactness_max float64 concavity_max float64 concave_max float64 symmetry_max float64 fractal_max float64 dtype: object
# Prediction target: the diagnosis label.
target = canser['Diagnosis']
# Features: every column after the first two (ID and Diagnosis). An explicit
# copy is taken because the columns are normalised in place further down;
# writing into a bare iloc slice of `canser` raises SettingWithCopyWarning and
# is not guaranteed to write where intended.
data = canser.iloc[:, 2:].copy()
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.1)
# Instantiate the classifier.
knn = KNeighborsClassifier(n_neighbors=10)
# Train on the training split.
knn.fit(X_train, y_train)
- 输出 KNeighborsClassifier(algorithm=’auto’, leaf_size=30, metric=’minkowski’, metric_params=None, n_jobs=1, n_neighbors=10, p=2, weights=’uniform’)
# Predict labels for the held-out rows.
y_ = knn.predict(X_test)
# Score: accuracy on the held-out rows.
knn.score(X_test, y_test)
- 输出 0.9473684210526315
# Build a cross table of predicted labels against true labels.
# pandas provides crosstab() for this.
# margins=True adds an extra "All" row and column holding the totals.
pd.crosstab(index=y_, columns=y_test, margins=True, rownames=['Predict'], colnames=['True'])
True | B | M | All |
---|---|---|---|
Predict | |||
B | 36 | 2 | 38 |
M | 1 | 18 | 19 |
All | 37 | 20 | 57 |
# How can the prediction accuracy be improved?
# Clean the data and apply min-max normalisation:
# (item - min) / (max - min)
nd = np.array([1, 2, 3, 4, 5])
# The results are shown as comments: the pasted "nd.max() = 5" form assigns to
# a call, which is a SyntaxError.
nd.max()  # 5
nd.min()  # 1
# Min-max normalise a single column as a first trial.
col = 'radius_mean'
# Compute the column bounds once. Recomputing max()/min() inside the mapper
# re-scans the whole column for every element (quadratic in the row count).
d_min = data[col].min()
d_max = data[col].max()


def convert2normed(item):
    """Min-max scale ``item`` using the bounds of ``data[col]``.

    ``item`` may be a single value or a whole Series; the arithmetic is
    applied element-wise in the latter case.
    Returns ``(item - d_min) / (d_max - d_min)``.
    """
    return (item - d_min) / (d_max - d_min)


# Overwrite the column with its normalised values (one vectorised call).
data[col] = convert2normed(data[col])
# 将所有的字段全部进行归一化
cols = data.columns
cols
输出
Index([‘radius_mean’, ‘texture_mean’, ‘perimeter_mean’, ‘area_mean’,
‘smoothness_mean’, ‘compactness_mean’, ‘concavity_mean’, ‘concave_mean’,
‘symmetry_mean’, ‘fractal_mean’, ‘radius_sd’, ‘texture_sd’,
‘perimeter_sd’, ‘area_sd’, ‘smoothness_sd’, ‘compactness_sd’,
‘concavity_sd’, ‘concave_sd’, ‘symmetry_sd’, ‘fractal_sd’, ‘radius_max’,
‘texture_max’, ‘perimeter_max’, ‘area_max’, ‘smoothness_max’,
‘compactness_max’, ‘concavity_max’, ‘concave_max’, ‘symmetry_max’,
‘fractal_max’],
dtype=’object’)
# Min-max normalise every feature column.
# The bounds are computed once per column and applied to the whole Series at
# once, rather than re-scanning the column for every element through a helper
# redefined on each iteration.
for col in cols:
    d_min = data[col].min()
    d_max = data[col].max()
    span = d_max - d_min
    if span != 0:
        data[col] = (data[col] - d_min) / span
    else:
        # A constant column has zero span; dividing would yield NaN and make
        # the later KNN fit fail, so map it to 0.0 instead.
        data[col] = 0.0
# Split again, this time on the normalised features.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.1)
# Re-train with 15 neighbours and score on the new held-out rows.
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)
输出
0.9824561403508771
利用分类绘制鸢尾花散点图
# Data set
import sklearn.datasets as datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
iris = datasets.load_iris()
data = iris['data']
target = iris['target']
target_names = iris['target_names']
data.shape
# In machine-learning terms, the 4 attributes correspond to 4 dimensions.
输出
(150, 4)
# The first two feature columns are the sepal measurements.
sepal = data[:, :2]
# Sepal length.
sepal_length = sepal[:, 0]
# Sepal width.
sepal_width = sepal[:,1]
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Three-colour map used to colour the points by their target value.
cmap = ListedColormap(['red', 'orange', 'pink'])
plt.scatter(sepal_length, sepal_width, c=target, cmap=cmap)
<matplotlib.collections.PathCollection at 0xbb56c18>
# The last two feature columns are the petal measurements.
petal = data[:, 2:]
# Petal length.
petal_length = petal[:, 0]
# Petal width.
petal_width = petal[:,1]
plt.scatter(petal_length, petal_width, c=target)
<matplotlib.collections.PathCollection at 0xaf43630>
# First, a classifier is still needed.
# Then we generate our own data points
# so that they fill the whole picture.
# How do we decide the colour of each generated point?
# With KNN.
knn = KNeighborsClassifier()
knn.fit(sepal, target)
输出
KNeighborsClassifier(algorithm=’auto’, leaf_size=30, metric=’minkowski’,
metric_params=None, n_jobs=1, n_neighbors=5, p=2,
weights=’uniform’)
# Build synthetic data.
# Choose the value range to sample along each axis.
import numpy as np
x = np.linspace(start=4, stop=8.2, num=200)
y = np.linspace(start=1.8, stop=4.5, num=150)
# meshgrid expands the two axes into a full grid of coordinates.
xx, yy = np.meshgrid(x, y, indexing='xy')
display(xx.shape, yy.shape)
- 输出
(150, 200)
(150, 200)
# Flatten both coordinate grids and pair them up as (x, y) rows.
xx = xx.ravel()
yy = yy.ravel()
xy = np.column_stack((xx, yy))
# Plot every generated grid point.
plt.scatter(xy[:, 0], xy[:, 1])
<matplotlib.collections.PathCollection at 0xba0ee10>
# Predict the class of each of the 30,000 generated points.
X_test = xy
y_ = knn.predict(X_test)
# Classification map of those 30,000 points.
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_, cmap='rainbow')
<matplotlib.collections.PathCollection at 0xbaaec88>
# Draw the 150 iris samples on top of the classification map.
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_, cmap='rainbow')
plt.scatter(sepal_length, sepal_width, c=target, cmap=cmap)
<matplotlib.collections.PathCollection at 0xb45f0b8>
knn做回归(KNN回归是非参数方法,并不是线性回归)
回归用于对于趋势的预测
台风预测
# 每年都要,太平洋中心就会形成台风,移动轨迹从海洋到陆地
# 中央气象台,预测,采集点,100km,风在转移的时候风速会下降
# 比如说20级,福建16级,红色预警
# 咱们要想要拿到风的运行轨迹,需要一个函数,预测风力,风力的函数
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
import matplotlib.pyplot as plt
# 50 evenly spaced sample points between 0 and 10.
data = np.linspace(0, 10, 50)
# X, Y
# Add noise so the curve is not perfectly smooth.
target = np.sin(data)
# Perturb every 5th sample. The noise vector is sized from the slice itself so
# it stays in step with the number of samples instead of a hard-coded 10.
target[::5] += np.random.randn(target[::5].size) * 0.35
plt.scatter(data, target)
<matplotlib.collections.PathCollection at 0x9f30cc0>
data
target
# Let the model learn the noisy function samples.
knn = KNeighborsRegressor()
# fit() is given the samples as a single-column 2-D array.
knn.fit(data.reshape(-1, 1), target)
输出
KNeighborsRegressor(algorithm=’auto’, leaf_size=30, metric=’minkowski’,
metric_params=None, n_jobs=1, n_neighbors=5, p=2,
weights=’uniform’)
# Dense set of 5000 query points over the same 0..10 range.
X_test = np.linspace(0, 10, 5000)
# Solve for y at every query point.
y_ = knn.predict(X_test.reshape(X_test.size, 1))
y_
输出
array([ 0.42095792, 0.42095792, 0.42095792, …, -0.17098925,
-0.17098925, -0.17098925])
# The computed curve still carries some error.
# Predicted curve in red, with the noisy training samples as scatter points.
plt.plot(X_test, y_, c='red')
plt.scatter(data, target)
<matplotlib.collections.PathCollection at 0xa41c240>
自己看理论: cursor ProxySQL(mysql的中间件) mysql8.0 与 5.7的区别
设计思想:
innodb,myisam 的区别:
事务的特性(原子性,一致性,持久性,隔离性) 三范式:分表思想
索引:主键,唯一,联合(联合索引也是普通索引),全文索引(fulltext key) (增加like的效率 ,只对myisam有效)
两张表:给你需求,让你写sql(连表查询,子查询)
存储过程:
类似于函数:写一条特别复杂的sql,连100表
函数:
触发器:
保证数据的完整性
人脸自动补全
- 给个上半部分的脸,把下半部分给自动补全
- 回归问题
- 为什么是回归问题?如果是分类 70亿
- 因为脸部有轮廓线
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
import numpy as np
# 再导入几种算法
# LinearRegression 线性回归
# Ridge 山岭 岭回归
# Lasso 套索 套索回归(带L1正则化的线性回归)
from sklearn.linear_model import LinearRegression, Ridge, Lasso
# y = wx
# w = y * x^-1
x = np.linspace(0,10, 100)
y = 3 * x
# Regression solves for the coefficient w; an error term can be added on top.
plt.plot(x, y)
[<matplotlib.lines.Line2D at 0xc92b780>]
# Bring in the face data set.
import sklearn.datasets as datasets
# Fetch the data set.
face = datasets.fetch_olivetti_faces()
face
# One flattened image per row.
data = face['data']
data.shape
输出
(400, 4096)
import math
math.sqrt(data.shape[1])
输出
64.0
# Split every face into an upper and a lower half by slicing the flattened
# pixel rows.
# Upper half: the first 2048 of the 4096 pixels.
face_up = data[:,:2048]
# Lower half: the remaining 2048 pixels.
face_down = data[:,2048:]
# Pick one face at random. The upper bound is taken from the data itself
# rather than a hard-coded 400, so it stays valid if the sample count changes.
# size=1 keeps `index` a one-element array, which the reshape calls rely on.
index = np.random.randint(0, data.shape[0], size=1)
plt.imshow(data[index].reshape(64, 64), cmap='gray')
<matplotlib.image.AxesImage at 0xc7f5da0>
# Left panel: upper half of the selected face.
axes = plt.subplot(121)
axes.imshow(face_up[index].reshape(32, 64), cmap='gray')
# Right panel: lower half of the selected face.
axes1 = plt.subplot(122)
axes1.imshow(face_down[index].reshape(32, 64), cmap='gray')
<matplotlib.image.AxesImage at 0xb9ecf28>
# Split the data: upper halves are the inputs, lower halves the targets.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(face_up, face_down, test_size=0.02)
# Train four algorithms on the same split and compare which is more accurate.
# The estimators are kept in a dict keyed by their display name.
estimator = {
    'KNN': KNeighborsRegressor(),
    'LinearRG': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
}
# Train each model in turn and collect its predictions.
result_ = {}
# The loop variable is deliberately NOT named `estimator`: reusing that name
# rebinds the dict to the last model, so the dict is gone after the loop and
# the cell cannot be run a second time.
for key, model in estimator.items():
    # Fit on the upper-half -> lower-half training pairs.
    model.fit(X_train, y_train)
    # Predict the lower halves of the test faces.
    y_ = model.predict(X_test)
    # Store this algorithm's predictions under its name.
    result_[key] = y_
# Inspect the KNN predictions.
knn_y = result_['KNN']
# Rebuild the first test face from its upper half and its true lower half.
face_true = np.concatenate([X_test[0], y_test[0]]).reshape(64, 64)
plt.imshow(face_true, cmap='gray')
<matplotlib.image.AxesImage at 0x114a1fd0>
result_
# Left panel: the true face.
axes = plt.subplot(121)
axes.imshow(face_true, cmap='gray')
# Right panel: the same upper half joined with the KNN-predicted lower half.
face_knn_predict = np.concatenate([X_test[0], knn_y[0]]).reshape(64, 64)
axes1 = plt.subplot(122)
axes1.imshow(face_knn_predict, cmap='gray')
<matplotlib.image.AxesImage at 0x116c6dd8>
# Show everything side by side: the true face, the upper half that was given
# as input, and each algorithm's completed face.
# Rows are capped at the number of test faces so indexing cannot run past the
# end of X_test; the column count follows the number of trained algorithms.
# With 8 test faces and 4 algorithms this is the same 8 x 6 grid as before.
n_rows = min(8, len(X_test))
n_cols = 2 + len(result_)
plt.figure(figsize=(n_cols * 2, n_rows * 2))
for i in range(n_rows):
    # 1-based subplot index of the first cell in this row.
    first = 1 + n_cols * i

    # Column 1: the real face (input upper half + true lower half).
    axes = plt.subplot(n_rows, n_cols, first)
    axes.axis('off')
    face_true = np.concatenate([X_test[i], y_test[i]]).reshape(64, 64)
    axes.imshow(face_true, cmap='gray')
    if i == 0:
        axes.set_title('True')

    # Column 2: the upper half only. A separate name is used so the
    # module-level `face_up` feature matrix is not overwritten by one image.
    axes = plt.subplot(n_rows, n_cols, first + 1)
    axes.axis('off')
    upper_half = X_test[i].reshape(32, 64)
    axes.imshow(upper_half, cmap='gray')
    if i == 0:
        axes.set_title('UP')

    # Remaining columns: the face completed by each algorithm.
    for j, (key, y_) in enumerate(result_.items()):
        face_predict = np.concatenate([X_test[i], y_[i]]).reshape(64, 64)
        axes = plt.subplot(n_rows, n_cols, first + 2 + j)
        axes.imshow(face_predict, cmap='gray')
        axes.axis('off')
        if i == 0:
            axes.set_title(key)