Implementation of kNN and Naive Bayes

Implement kNN and Naive Bayes on the breast cancer dataset.

kNN

Import dataset

Dataset: breast_cancer (the Wisconsin diagnostic breast cancer dataset shipped with sklearn)

# load the dataset
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import neighbors

dataset = datasets.load_breast_cancer()
X = dataset.data
y = dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
k = 5
# print(dataset.DESCR)
# malignant - 0, benign - 1
y_predict = []
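
As a quick sanity check, the split sizes can be verified: the dataset has 569 samples with 30 attributes each, so a 70/30 split leaves 398 instances for training and 171 for testing.

print(X_train.shape, X_test.shape)  # expected: (398, 30) (171, 30)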

KNN implementation

def knn(X_train, y_train, X_test, y_predict):
    '''
    Predict a label for every instance in the test set (to be compared with
    y_test later), using the Euclidean distance as the similarity measure.
    '''
    for test_data in X_test:
        first_k_instance = []  # the k nearest points found so far (k is the global defined above)
        for i in range(len(X_train)):
            distance = 0
            for attributes_no in range(len(X_train[0])):
                distance += (test_data[attributes_no] - X_train[i][attributes_no]) ** 2
            Euclid_distance = distance ** 0.5
            # print(Euclid_distance)
            if i < k:
                first_k_instance.append((i, Euclid_distance))
            elif Euclid_distance < first_k_instance[k - 1][1]:
                # closer than the current k-th nearest point: replace it
                first_k_instance[k - 1] = (i, Euclid_distance)
            first_k_instance = sorted(first_k_instance, key=lambda x: x[1])
        # print(first_k_instance)
        # We now have the k training points nearest to the test point;
        # majority voting decides whether it is benign or malignant.
        benign = 0
        malignant = 0
        for instance in first_k_instance:
            if y_train[instance[0]] == 0:
                malignant += 1
            else:
                benign += 1
        if malignant >= benign:
            y_predict.append(0)
        else:
            y_predict.append(1)
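
As an aside, the same neighbour search can be written much more compactly with NumPy broadcasting. This is only a sketch of an equivalent alternative, not part of the implementation above; it assumes the arrays and k defined earlier.

import numpy as np

def knn_vectorized(X_train, y_train, X_test, k=5):
    y_pred = []
    for test_data in X_test:
        # Euclidean distance from the test point to every training point
        distances = np.sqrt(((X_train - test_data) ** 2).sum(axis=1))
        # indices of the k nearest training points
        nearest = np.argsort(distances)[:k]
        # majority vote; labels are 0 (malignant) and 1 (benign),
        # with ties resolved to malignant as in the loop version above
        y_pred.append(1 if y_train[nearest].sum() > k / 2 else 0)
    return y_pred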

Calculate accuracy

def accuracy(y_predict, y_test):
    correct = 0
    for i in range(len(y_predict)):
        if y_predict[i] == y_test[i]:
            correct += 1
    accuracy_rate = correct / len(y_predict)
    return correct, accuracy_rate

Main function

def main():
    knn(X_train, y_train, X_test, y_predict)
    correct, accuracy_rate = accuracy(y_predict, y_test)
    print(y_predict)
    print("Test-set accuracy of our kNN model: %.3f" % accuracy_rate)
    KNN = neighbors.KNeighborsClassifier(n_neighbors=5)
    KNN.fit(X_train, y_train)
    print("Test-set accuracy of sklearn's kNN model: %.3f" % KNN.score(X_test, y_test))

if __name__ == '__main__':
    main()

[0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1]
Test-set accuracy of our kNN model: 0.947
Test-set accuracy of sklearn's kNN model: 0.947

The results show that our kNN implementation achieves the same accuracy as the kNN classifier provided by sklearn.

We can further improve accuracy by tuning the value of k or by changing the similarity measure (replacing the Euclidean distance with the matching coefficient, the Jaccard coefficient, etc.); a quick scan over k is sketched below.
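
For instance, candidate values of k can be scanned with sklearn's classifier. This is a sketch reusing the split above; the best k depends on the data.

from sklearn import neighbors

for k in [1, 3, 5, 7, 9, 11]:
    clf = neighbors.KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train, y_train)
    print("k = %2d, test accuracy = %.3f" % (k, clf.score(X_test, y_test)))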

Naive Bayes

Import Dataset

# load the dataset
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes

dataset = datasets.load_breast_cancer()
X = dataset.data
y = dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# print(dataset.DESCR)
# malignant - 0, benign - 1
y_predict = []

Since all 30 attributes are continuous, Naive Bayes requires the range of each attribute to be divided into intervals, so that we can estimate the probability of an instance falling into each interval. Here I split every attribute at its mean, giving two intervals per attribute; a vectorised version of this split is sketched below.
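
Because the sklearn arrays support vectorised operations, the mean split itself can be expressed in a few lines. This sketch only illustrates the idea; the implementation below counts the intervals explicitly.

import numpy as np

means = X_train.mean(axis=0)                    # per-attribute mean over the training set
X_train_binary = (X_train > means).astype(int)  # 1 = above the mean, 0 = at or below it
X_test_binary = (X_test > means).astype(int)    # reuse the training means for the test set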

Divide consecutive attributes into intervals

def distribution(X_train, y_train):
    '''
    First split each attribute range into intervals, then count how many
    training instances of each class fall into each interval.
    '''
    # =============== interval boundaries =============== #
    attributes_max_min_mean = []
    # record the maximum, minimum and mean of every attribute
    for i in range(len(X_train[0])):
        # loop over attributes
        # section = [max, min, mean]
        section = [X_train[0][i], X_train[0][i], 0]
        for instance in X_train:
            # loop over training instances
            if instance[i] > section[0]:
                section[0] = instance[i]
            if instance[i] < section[1]:
                section[1] = instance[i]
            section[2] += instance[i]
        section[2] /= len(X_train)
        attributes_max_min_mean.append(section)

    # ======= count the instances of each class in each interval ======= #
    instance_distribution = []
    for i in range(len(X_train[0])):
        # loop over attributes
        smaller_benign = 0
        larger_benign = 0
        smaller_malignant = 0
        larger_malignant = 0
        for j in range(len(X_train)):
            # loop over training instances
            if X_train[j][i] > attributes_max_min_mean[i][2]:
                if y_train[j] == 1:
                    larger_benign += 1
                else:
                    larger_malignant += 1
            elif y_train[j] == 1:
                smaller_benign += 1
            else:
                smaller_malignant += 1
        instance_distribution.append([smaller_benign, larger_benign, smaller_malignant, larger_malignant])

    return instance_distribution, attributes_max_min_mean

Implementation

def Naive_Bayes(X_test, y_predict, instance_distribution, attributes_max_min_mean):
    for test_data in X_test:
        # loop over test instances
        # class counts: the number of malignant and benign tumours in the
        # training set (the first attribute's counts sum to the class totals)
        malignant = instance_distribution[0][2] + instance_distribution[0][3]
        benign = instance_distribution[0][0] + instance_distribution[0][1]
        # initialise the class-conditional probabilities, then multiply in
        # one factor per attribute
        p_xc0 = 1
        p_xc1 = 1
        for i in range(len(test_data)):
            # loop over attributes
            if test_data[i] > attributes_max_min_mean[i][2]:
                p_xc0 *= instance_distribution[i][3] / malignant
                p_xc1 *= instance_distribution[i][1] / benign
            else:
                p_xc0 *= instance_distribution[i][2] / malignant
                p_xc1 *= instance_distribution[i][0] / benign
        # multiply by the class priors and compare
        p0 = p_xc0 * malignant / (malignant + benign)
        p1 = p_xc1 * benign / (malignant + benign)
        if p0 > p1:
            y_predict.append(0)
        else:
            y_predict.append(1)
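
One caveat with the product above: if an interval happens to contain no training samples of a class, the corresponding factor is zero and the whole class probability collapses. A standard remedy is Laplace (add-one) smoothing; below is a minimal sketch of a smoothed factor, reusing the names from the code above (the helper smoothed is hypothetical, not part of the implementation).

def smoothed(count, class_total, n_intervals=2):
    # Laplace (add-one) smoothing: each of the n_intervals intervals gets a
    # pseudo-count of 1, so no conditional probability is ever exactly zero.
    return (count + 1) / (class_total + n_intervals)

# inside the attribute loop, the factors would become, for example:
# p_xc0 *= smoothed(instance_distribution[i][3], malignant)
# p_xc1 *= smoothed(instance_distribution[i][1], benign)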

Calculate accuracy

def accuracy(y_predict, y_test):
    correct = 0
    for i in range(len(y_predict)):
        if y_predict[i] == y_test[i]:
            correct += 1
    accuracy_rate = correct / len(y_predict)
    return correct, accuracy_rate

Main function

def main():
    instance_distribution, attributes_max_min_mean = distribution(X_train, y_train)
    Naive_Bayes(X_test, y_predict, instance_distribution, attributes_max_min_mean)
    correct, accuracy_rate = accuracy(y_predict, y_test)
    print(y_predict)
    print("Test-set accuracy of our Naive Bayes model: %.3f" % accuracy_rate)
    bayes = naive_bayes.GaussianNB()
    bayes.fit(X_train, y_train)
    print("Test-set accuracy of sklearn's Naive Bayes model: %.3f" % bayes.score(X_test, y_test))

if __name__ == '__main__':
    main()

[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1]
Test-set accuracy of our Naive Bayes model: 0.930
Test-set accuracy of sklearn's Naive Bayes model: 0.924

The experimental results show that our Naive Bayes implementation performs slightly better here than the Naive Bayes classifier provided by sklearn.

We can further optimize the accuracy by trying different interval divisions for each attribute. As for the sklearn result: GaussianNB does not discretize at all, it models each continuous attribute with a Gaussian distribution, so its slightly lower accuracy here suggests that a normal distribution is not an ideal fit for every attribute.
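
To experiment with interval divisions without rewriting the counting code, sklearn can chain a discretizer with a categorical Naive Bayes model. This is a sketch assuming the same split is in scope; KBinsDiscretizer and CategoricalNB are standard sklearn classes, but the bin counts chosen here are arbitrary.

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.naive_bayes import CategoricalNB

for n_bins in [2, 4, 8]:
    model = make_pipeline(
        # encode='ordinal' maps each attribute value to its bin index
        KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile'),
        CategoricalNB(),
    )
    model.fit(X_train, y_train)
    print("n_bins = %d, test accuracy = %.3f" % (n_bins, model.score(X_test, y_test)))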