Python实现简单的KNN分类算法

原理:计算当前点(无label,一般来自测试集)与其他每个点(有label,一般来自训练集)之间的距离,并按升序排序;选取距离最小的k个点,根据这k个点对应的类别进行投票,票数最多的类别即为该点所对应的类别。
代码实现(数据集采用的是iris):

 1 import numpy as np
 2 from sklearn.datasets import load_iris
 3 from sklearn.model_selection import train_test_split
 4 from sklearn import neighbors
 5 from sklearn.metrics import accuracy_score
 6 
 7 def get_iris():
 8     iris_data = load_iris()
 9     X_train, X_test, y_train, y_test = train_test_split(iris_data.data, iris_data.target, test_size=0.4, random_state=0)
10     return X_train, X_test, y_train, y_test
11 
def knn_classify(self_point, dataset, labels, k):
    """Classify one point by majority vote among its k nearest neighbours.

    Distances are Euclidean. Inputs may be NumPy arrays or plain nested
    Python sequences; each sample is flattened before the distance is
    computed, so multi-dimensional samples are accepted as well.

    Args:
        self_point: The unlabelled point to classify.
        dataset: The labelled samples, one per entry along the first axis.
        labels: One label per sample, in the same order as ``dataset``.
        k: Number of nearest neighbours that vote; if it exceeds the number
            of samples, every sample votes.

    Returns:
        The winning label converted with ``str``. When several labels have
        the same number of votes, the one whose first voter is closest to
        ``self_point`` wins.

    Raises:
        ValueError: If ``k`` is smaller than 1, ``dataset`` is empty, or
            ``labels`` and ``dataset`` differ in length.
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter

    # A negative k would silently slice "all but the last |k|" neighbours,
    # and k == 0 would leave nobody to vote, so reject both up front.
    if k < 1:
        raise ValueError("k must be a positive integer")

    label_list = list(labels)
    samples = np.asarray(dataset, dtype=float)
    if samples.size == 0:
        raise ValueError("dataset must contain at least one sample")
    if len(label_list) != len(samples):
        raise ValueError("labels and dataset must have the same length")

    point = np.asarray(self_point, dtype=float)
    # One vectorised pass over all samples instead of a Python-level loop.
    diffs = (samples - point).reshape(len(samples), -1)
    distances = np.sqrt(np.sum(diffs ** 2, axis=1))

    # A stable sort keeps equally distant samples in dataset order.
    nearest = np.argsort(distances, kind="stable")[:k]

    # Counter preserves insertion order and max() returns the first maximal
    # key, so ties go to the label encountered first (i.e. the closest one).
    votes = Counter(str(label_list[i]) for i in nearest)
    return max(votes, key=votes.get)
22 
23 
# Evaluate the hand-written classifier on the held-out iris samples.
X_train, X_test, y_train, y_test = get_iris()
size = len(y_test)
# knn_classify returns the label as a string, so compare against str(truth).
count = sum(
    knn_classify(sample, X_train, y_train, 5) == str(truth)
    for sample, truth in zip(X_test, y_test)
)
print('custom的准确率: ', count / size)

# Reference run with scikit-learn's built-in KNN using the same k.
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
pre = knn.predict(X_test)
print('sklearn的准确率: ', accuracy_score(y_test, pre))

对比结果:
custom的准确率: 0.95
sklearn的准确率: 0.95