IT数码 购物 网址 头条 软件 日历 阅读 图书馆
TxT小说阅读器
↓语音阅读,小说下载,古典文学↓
图片批量下载器
↓批量下载图片,美女图库↓
图片自动播放器
↓图片自动播放器↓
一键清除垃圾
↓轻轻一点,清除系统垃圾↓
开发: C++知识库 Java知识库 JavaScript Python PHP知识库 人工智能 区块链 大数据 移动开发 嵌入式 开发工具 数据结构与算法 开发测试 游戏开发 网络协议 系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑 笔记本 显卡 显示器 固态硬盘 硬盘 耳机 手机 iphone vivo oppo 小米 华为 单反 装机 图拉丁
 
   -> 人工智能 -> 机器学习Sklearn实战——KNN算法 -> 正文阅读

[人工智能]机器学习Sklearn实战——KNN算法

KNN鸢尾花分类

import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
import numpy as np
# Load iris as (features, labels). return_X_y is passed by keyword because
# current scikit-learn releases no longer accept it positionally.
X, y = datasets.load_iris(return_X_y=True)
X = X[:, :2]  # keep only the first two feature columns so the data can be drawn in 2-D
plt.scatter(X[:, 0], X[:, 1], c=y)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
# Build a dense grid of points over the plotted area; predicting on it later
# reveals the classifier's decision regions.
x1 = np.linspace(4, 8, 100)    # horizontal axis: 4 to 8
y1 = np.linspace(2, 4.5, 80)   # vertical axis: 2 to 4.5
X1, Y1 = np.meshgrid(x1, y1)
X1 = X1.reshape(-1, 1)
Y1 = Y1.reshape(-1, 1)
X_test = np.concatenate([X1, Y1], axis=1)  # shape (8000, 2): one row per grid point

from matplotlib.colors import ListedColormap
# Pale palette for the grid predictions (background), saturated palette for
# the training samples drawn on top of it.
lc1 = ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"])
lc2 = ListedColormap(["#FF0000", "#00FF00", "#0000FF"])
y_ = knn.predict(X_test)
# Draw the background first so the real samples stay visible.
for points, labels, palette in ((X_test, y_, lc1), (X, y, lc2)):
    plt.scatter(points[:, 0], points[:, 1], c=labels, cmap=palette)

KNN参数的筛选

import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.model_selection import cross_val_score

# return_X_y is passed by keyword because current scikit-learn releases
# no longer accept it positionally.
X, y = datasets.load_iris(return_X_y=True)
knn = KNeighborsClassifier()
# Accuracy of the default classifier on each of 6 cross-validation folds.
score = cross_val_score(knn, X, y, scoring="accuracy", cv=6)
print(score)

import matplotlib.pyplot as plt

# Candidate neighbour counts 1..13; the original note gives the upper bound
# as roughly sqrt(150).
k_values = range(1, 14)
# Mean 6-fold cross-validation error rate (1 - accuracy) for each k.
errors = [
    1 - cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X, y, scoring="accuracy", cv=6
    ).mean()
    for k in k_values
]
plt.plot(k_values, errors)

# Compare the two neighbour-weighting schemes at k = 12 by mean 6-fold accuracy.
weights = ["uniform", "distance"]
for scheme in weights:
    model = KNeighborsClassifier(weights=scheme, n_neighbors=12)
    fold_scores = cross_val_score(model, X, y, cv=6, scoring="accuracy")
    print(fold_scores.mean())
0.98
0.9733333333333333
# Mean 6-fold accuracy for every (k, weighting) pair, keyed as "<weighting><k>".
# k varies in the outer loop and the weighting in the inner one, so keys are
# inserted as uniform1, distance1, uniform2, ...
result = {
    w + str(k): cross_val_score(
        KNeighborsClassifier(n_neighbors=k, weights=w),
        X,
        y,
        scoring="accuracy",
        cv=6,
    ).mean()
    for k in range(1, 14)
    for w in weights
}
result
{'uniform1': 0.96,
 'distance1': 0.96,
 'uniform2': 0.94,
 'distance2': 0.96,
 'uniform3': 0.9666666666666667,
 'distance3': 0.9666666666666667,
 'uniform4': 0.9666666666666667,
 'distance4': 0.9666666666666667,
 'uniform5': 0.9666666666666667,
 'distance5': 0.9666666666666667,
 'uniform6': 0.9666666666666667,
 'distance6': 0.96,
 'uniform7': 0.9733333333333333,
 'distance7': 0.9733333333333333,
 'uniform8': 0.9666666666666667,
 'distance8': 0.9666666666666667,
 'uniform9': 0.9733333333333333,
 'distance9': 0.9733333333333333,
 'uniform10': 0.96,
 'distance10': 0.96,
 'uniform11': 0.9733333333333333,
 'distance11': 0.9733333333333333,
 'uniform12': 0.98,
 'distance12': 0.9733333333333333,
 'uniform13': 0.9733333333333333,
 'distance13': 0.9733333333333333}
# Position of the highest mean accuracy, in the dict's insertion order
# (argmax returns the first maximum when there are ties).
best_index = np.array(list(result.values())).argmax()
best_index
# Look the key up from the computed position instead of a hard-coded index,
# so the lookup stays correct if the scores change.
list(result)[best_index]
22
'uniform12'

KNN癌症诊断

import numpy as np 
import pandas as pd
from pandas import Series,DataFrame
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Load the tab-separated cancer data set and discard the ID column.
cancer = pd.read_csv("/Users/zhucan/Desktop/cancer.csv", sep="\t")
cancer.drop(columns="ID", inplace=True)
y = cancer["Diagnosis"]
X = cancer.iloc[:, 1:]  # every column after the first one
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Exhaustive search over neighbour count, weighting scheme and Minkowski
# power p, scored by 6-fold cross-validated accuracy.
params = {
    "n_neighbors": list(range(1, 30)),
    "weights": ["distance", "uniform"],
    "p": [1, 2],
}
knn = KNeighborsClassifier()
gcv = GridSearchCV(knn, params, scoring="accuracy", cv=6)
gcv.fit(X_train, y_train)
gcv.best_estimator_
gcv.best_score_
gcv.best_params_
y_ = gcv.predict(X_test)
gcv.score(X_test, y_test)  # the fitted gcv now behaves as gcv.best_estimator_
# Show the confusion matrix: rows are the true labels, columns the predictions.
pd.crosstab(index=y_test, columns=y_, rownames=["True"], colnames=["Predict"])
KNeighborsClassifier(n_neighbors=4, p=1, weights='distance')
0.9516666666666667
{'n_neighbors': 4, 'p': 1, 'weights': 'distance'}
0.9385964912280702
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# confusion_matrix expects (y_true, y_pred). With the arguments swapped the
# matrix comes out transposed, and the false-positive and false-negative
# counts change places.
confusion_matrix(y_test, y_)
# Per-class precision, recall and F1 for the same predictions.
print(classification_report(y_test, y_, target_names=["B", "M"]))
 

78/(78+5) = 0.94    78/(78+2) = 0.97

29/(29+2) = 0.94    29/(29+5) = 0.85

找健康的比找生病的要强

KNN数据归一化操作

# Min-max normalisation: subtract each column's minimum and divide by its range.
# NOTE(review): the minimum and range are computed over all of X before the
# split below, so the held-out rows also contribute to the scaling statistics.
col_min = X.min()
col_range = X.max() - col_min
X_norm1 = (X - col_min) / col_range
X_train, X_test, y_train, y_test = train_test_split(X_norm1, y, test_size=0.2)

# Same hyper-parameter grid as before, now fitted on the normalised features.
params = {
    "n_neighbors": list(range(1, 30)),
    "weights": ["distance", "uniform"],
    "p": [1, 2],
}
knn = KNeighborsClassifier()
gcv = GridSearchCV(knn, params, scoring="accuracy", cv=6)
gcv.fit(X_train, y_train)
from sklearn.metrics import accuracy_score
y_ = gcv.predict(X_test)
accuracy_score(y_test, y_)
0.9649122807017544
# An alternative: standardisation (subtract the column mean, divide by the
# column standard deviation).
col_mean = X.mean()
col_std = X.std()
X_norm2 = (X - col_mean) / col_std

from sklearn.preprocessing import MinMaxScaler,StandardScaler
# MinMaxScaler gives the same result as the manual min-max normalisation.
mms = MinMaxScaler()
X2 = mms.fit(X).transform(X)

# StandardScaler performs the same kind of standardisation as X_norm2.
# NOTE: the numbers differ slightly from X_norm2, because pandas' std()
# defaults to the sample estimate (ddof=1) while StandardScaler divides by
# the population standard deviation (ddof=0).
ss = StandardScaler()
X3 = ss.fit(X).transform(X)
X3

sklearn中数据拆分

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.model_selection import KFold,StratifiedKFold #KFold、StratifiedKFold将数据分成多少份

data = np.random.randint(0, 10, size=(8, 2))
target = np.array([0, 0, 1, 0, 1, 1, 1, 0])
# split() yields index arrays (train, test); the indices are all that is
# needed to pull out the matching rows.
# The splitter is bound to its own name so the KFold class is not overwritten
# by an instance and can still be instantiated afterwards.
kFold = KFold(n_splits=4)
for train, test in kFold.split(data, target):
    print(target[train], target[test])
[1 0 1 1 1 0] [0 0]
[0 0 1 1 1 0] [1 0]
[0 0 1 0 1 0] [1 1]
[0 0 1 0 1 1] [1 0]
# Split into 4 folds; each fold keeps the same class proportions as the
# full data set.
sKFold = StratifiedKFold(n_splits=4)
for train_idx, test_idx in sKFold.split(data, target):
    train_labels = target[train_idx]
    test_labels = target[test_idx]
    print(train_labels, test_labels)
[0 0 1 1 1 0] [0 1]
[0 1 0 1 1 0] [0 1]
[0 0 1 1 1 0] [0 1]
[0 0 1 0 1 1] [1 0]

#train_test_split,KFold,StratifiedKFold作用都是将数据拆分

str类型数据的转变与训练预测

# Load the salary data set and drop the columns that are not used as features.
data = pd.read_csv("/Users/zhucan/Desktop/salary.txt")
data.drop(labels=["final_weight", "education", "capital_gain", "capital_loss"], axis=1, inplace=True)
# Features are every column except the last. The slice is copied because its
# columns are overwritten later, and assigning into a slice of `data`
# triggers pandas' chained-assignment warning.
X = data.iloc[:, 0:-1].copy()
# The target is selected by column label. iloc accepts integer positions
# only, so data.iloc["salary"] raises instead of returning the column.
y = data["salary"]
# Convert the str values in the data to int/float so the algorithm can compute with them.
# Candidate tools: map, apply, transform.
u = X["workclass"].unique()
u
array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
       'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object)
# Position of 'Local-gov' inside u; this position is used as the numeric code.
np.argwhere(u=='Local-gov')[0,0]
4
def convert(x):
    """Map a value to its position in the current array of unique values `u`.

    `u` is read from the enclosing scope each time the function is called,
    so rebinding `u` before a call changes the table used for the lookup.
    """
    matches = np.argwhere(u == x)
    return matches[0, 0]


X["workclass"] = X["workclass"].map(convert)

cols = ['marital_status', 'occupation','relationship', 'race', 'sex','native_country']
for col in cols:
    # Refresh the lookup table for this column, then encode the column.
    u = X[col].unique()
    X[col] = X[col].map(convert)
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.model_selection import KFold,StratifiedKFold

# Load the salary data set, preview it, and drop the columns that are not
# used as features.
data = pd.read_csv("/Users/zhucan/Desktop/salary.txt")
data.head()
data.drop(columns=["final_weight", "education", "capital_gain", "capital_loss"], inplace=True)

# Features are every column except the last. The slice is copied because its
# columns are overwritten by the encoding step that follows, and assigning
# into a slice of `data` triggers pandas' chained-assignment warning.
X = data.iloc[:, 0:-1].copy()
y = data["salary"]

# Replace every string-valued column with integer codes.
# pd.factorize numbers the distinct values in order of first appearance.
# That is the same code the manual "position within unique()" lookup
# produced, without a linear np.argwhere scan for every row.
# NOTE: a missing value (NaN) is encoded as -1 by factorize.
cols = ['marital_status','occupation','relationship','race','sex','native_country']
for col in ["workclass", *cols]:
    X[col] = pd.factorize(X[col])[0]

# Manual 10-fold cross-validation: fit on each training split, score on the
# held-out split, and average the per-fold accuracies.
n_splits = 10
knn = KNeighborsClassifier()
kFold = KFold(n_splits)
accuracy = 0
for train, test in kFold.split(X, y):
    # split() yields positional indices, so rows are selected with iloc;
    # label-based selection only works when the index is 0..n-1.
    knn.fit(X.iloc[train], y.iloc[train])
    acc = knn.score(X.iloc[test], y.iloc[test])
    accuracy += acc / n_splits
print(accuracy)
0.7973345728987424
  人工智能 最新文章
2022吴恩达机器学习课程——第二课(神经网
第十五章 规则学习
FixMatch: Simplifying Semi-Supervised Le
数据挖掘Java——Kmeans算法的实现
大脑皮层的分割方法
【翻译】GPT-3是如何工作的
论文笔记:TEACHTEXT: CrossModal Generaliz
python从零学(六)
详解Python 3.x 导入(import)
【答读者问27】backtrader不支持最新版本的
上一篇文章      下一篇文章      查看所有文章
加:2021-10-25 12:32:33  更:2021-10-25 12:34:35 
 
开发: C++知识库 Java知识库 JavaScript Python PHP知识库 人工智能 区块链 大数据 移动开发 嵌入式 开发工具 数据结构与算法 开发测试 游戏开发 网络协议 系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑 笔记本 显卡 显示器 固态硬盘 硬盘 耳机 手机 iphone vivo oppo 小米 华为 单反 装机 图拉丁

360图书馆 购物 三丰科技 阅读网 日历 万年历 2024年11日历 -2024/11/27 8:43:09-

图片自动播放器
↓图片自动播放器↓
TxT小说阅读器
↓语音阅读,小说下载,古典文学↓
一键清除垃圾
↓轻轻一点,清除系统垃圾↓
图片批量下载器
↓批量下载图片,美女图库↓
  网站联系: qq:121756557 email:121756557@qq.com  IT数码