
Fisher, SVM, K-Means and their optimization

Implement the Fisher, SVM, and K-Means algorithms and optimize them.

Fisher algorithm and its optimization

Implement the Fisher algorithm and validate it on randomly generated data.

Generating data

from sklearn.datasets import make_multilabel_classification
import numpy as np

x, y = make_multilabel_classification(n_samples=200, n_features=2,
                                      n_labels=1, n_classes=1,
                                      random_state=2)

# Split the samples by class label
index1 = np.array([index for (index, value) in enumerate(y) if value == 0])  # indices of the samples with label 0
index2 = np.array([index for (index, value) in enumerate(y) if value == 1])  # indices of the samples with label 1

c_1 = x[index1]  # all samples (x1, x2) of the first class, X_1
c_2 = x[index2]  # all samples (x1, x2) of the second class, X_2

make_multilabel_classification
n_samples: the number of samples.
n_features: the number of features.
n_labels: the average number of labels per sample.
n_classes: the number of classes.
random_state: the random seed, so that the same data is generated every time.
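A quick check of what the call above returns (a shape check only; the exact values depend on the random seed):

print(x.shape, y.shape)  # x holds 200 two-dimensional samples; y has one indicator column per class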

enumerate()
enumerate() combines an iterable (such as a list, tuple, or string) into an index sequence, yielding each element together with its index; it is typically used in a for loop.
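A minimal illustration of the index/value pairs it produces (the toy list below is only for demonstration):

labels_demo = [0, 1, 0]
for index, value in enumerate(labels_demo):
    print(index, value)  # prints: 0 0, then 1 1, then 2 0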

The implementation of Fisher

def cal_cov_and_avg(samples):
    """Return the scatter matrix and the mean vector of one class."""
    u1 = np.mean(samples, axis=0)
    cov_m = np.zeros((samples.shape[1], samples.shape[1]))
    for s in samples:
        t = s - u1
        cov_m += t * t.reshape(-1, 1)  # outer product (s - u1)(s - u1)^T
    return cov_m, u1


def fisher(c_1, c_2):
    """Compute the Fisher projection direction w = S_w^{-1} (u1 - u2)."""
    cov_1, u1 = cal_cov_and_avg(c_1)
    cov_2, u2 = cal_cov_and_avg(c_2)
    s_w = cov_1 + cov_2  # within-class scatter matrix
    u, s, v = np.linalg.svd(s_w)  # singular value decomposition
    s_w_inv = np.dot(np.dot(v.T, np.linalg.inv(np.diag(s))), u.T)  # inverse of s_w computed via the SVD
    return np.dot(s_w_inv, u1 - u2)

np.mean: compute the average along the specified axis.
np.zeros: create an array of the given shape filled with zeros.
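For reference, the direction computed by fisher above is the standard closed-form maximizer of the Fisher criterion (a sketch only, written with the same $S_w$, $u_1$, $u_2$ as in the code):

$$
J(w) = \frac{\left(w^{T}u_1 - w^{T}u_2\right)^{2}}{w^{T}S_w\,w},
\qquad
w^{*} \propto S_w^{-1}\,(u_1 - u_2)
$$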

Determine the category

def judge(sample, w, c_1, c_2):
    """Project the sample onto w and return True if it is closer to the projected mean of c_1."""
    u1 = np.mean(c_1, axis=0)
    u2 = np.mean(c_2, axis=0)
    center_1 = np.dot(w.T, u1)
    center_2 = np.dot(w.T, u2)
    pos = np.dot(w.T, sample)
    return abs(pos - center_1) < abs(pos - center_2)


w = fisher(c_1, c_2)  # compute the projection direction w
out = judge(c_1[1], w, c_1, c_2)  # decide which class the sample belongs to
# print(out)
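To check more than a single point, every sample can be scored in the same way (a small sketch reusing judge as defined above; the accuracy variable is only for illustration):

correct = sum(judge(s, w, c_1, c_2) for s in c_1) + \
          sum(not judge(s, w, c_1, c_2) for s in c_2)
accuracy = correct / (len(c_1) + len(c_2))
print(accuracy)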

Plot

import matplotlib.pyplot as plt

plt.scatter(c_1[:, 0], c_1[:, 1], c='#99CC99')
plt.scatter(c_2[:, 0], c_2[:, 1], c='#FFCC00')
line_x = np.arange(min(np.min(c_1[:, 0]), np.min(c_2[:, 0])),
                   max(np.max(c_1[:, 0]), np.max(c_2[:, 0])),
                   step=1)

line_y = - (w[0] * line_x) / w[1]
plt.plot(line_x, line_y)
plt.show()
(Figure: scatter plot of the two classes together with the fitted projection line.)

The detailed derivation of the SVM dual problem

reference: https://zhuanlan.zhihu.com/p/49331510
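The full derivation is in the linked reference; as a summary in standard soft-margin notation (not quoted from the reference), the primal problem and the dual obtained from its Lagrangian are:

$$
\min_{w,\,b,\,\xi}\ \frac{1}{2}\lVert w\rVert^{2} + C\sum_{i=1}^{n}\xi_i
\quad\text{s.t.}\quad y_i\left(w^{T}x_i + b\right) \ge 1 - \xi_i,\ \ \xi_i \ge 0
$$

$$
\max_{\alpha}\ \sum_{i=1}^{n}\alpha_i - \frac{1}{2}\sum_{i=1}^{n}\sum_{j=1}^{n}\alpha_i\alpha_j\,y_i y_j\,x_i^{T}x_j
\quad\text{s.t.}\quad \sum_{i=1}^{n}\alpha_i y_i = 0,\ \ 0 \le \alpha_i \le C
$$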





The implementation of SVM

reference: https://www.jb51.net/article/131580.htm

from sklearn import svm 
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)
x = np.r_[np.random.randn(100, 2) - [2, 2], np.random.randn(100, 2) + [2, 2]]  # two Gaussian blobs of 100 points each, shifted to (-2,-2) and (2,2)
y = [0] * 100 + [1] * 100  # 100 samples of class 0 followed by 100 of class 1

clf = svm.SVC(kernel='linear')
clf.fit(x, y)

w = clf.coef_[0]  # weight vector of the separating hyperplane
a = -w[0] / w[1]  # slope of the decision boundary
# plot the decision boundary
xx = np.linspace(-5, 5)  # x values in (-5, 5)
yy = a * xx - (clf.intercept_[0]) / w[1]  # solve w0*x + w1*y + b = 0 for y

# margin lines through the support vectors, parallel to the decision boundary
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])

print("W:",w)
print("a:",a)

print("support_vectors_:",clf.support_vectors_)
print("clf.coef_:",clf.coef_)

plt.figure(figsize=(8,4))
plt.plot(xx,yy)
plt.plot(xx,yy_down)
plt.plot(xx,yy_up)
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=80)
plt.scatter(x[:, 0], x[:, 1], c=y, cmap=plt.cm.Paired)  # column 0 vs. column 1 of x, colored by class

plt.axis('tight')

plt.show()
W: [0.95070185 1.15607502]
a: -0.8223530762163854
support_vectors_: [[-0.51174781 -0.10411082]
 [ 0.16323595 -0.66347205]
 [ 2.39904635 -0.77259276]
 [ 0.66574153  0.65328249]
 [-0.25556423  0.97749316]]
clf.coef_: [[0.95070185 1.15607502]]
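For reference, the plotted lines come from rewriting the hyperplane $w_0 x + w_1 y + b = 0$ as $y = -\frac{w_0}{w_1}x - \frac{b}{w_1}$ (slope $a = -w_0/w_1$); `yy_down` and `yy_up` are parallel lines with the same slope passing through a support vector $(b_0, b_1)$, i.e. $y = a x + (b_1 - a b_0)$.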

(Figure: training points, decision boundary, margin lines, and highlighted support vectors.)

The implementation of k-means

Reference1: https://cloud.tencent.com/developer/article/1465020
Reference2: https://blog.csdn.net/weixin_42029738/article/details/81978038

'''
K-Means clustering algorithms

'''

print(__doc__)

import time

import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin
from sklearn.datasets import make_blobs  # sklearn.datasets.samples_generator was removed in newer scikit-learn versions

# #############################################################################
# Generate sample data
np.random.seed(0)

batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]  # three initial centers
n_clusters = len(centers)  # number of clusters = 3
# generate 10000 two-dimensional samples around the three centers above, with standard deviation 0.7
X, labels_true = make_blobs(n_samples=10000, centers=centers, cluster_std=0.7)

# #############################################################################
# Compute clustering with Means

k_means = KMeans(init='k-means++', n_clusters=3, n_init=10)
t0 = time.time()
k_means.fit(X)
# time taken by k-means to fit the 10000 samples
t_batch = time.time() - t0

# #############################################################################
# Plot result

fig = plt.figure(figsize=(8, 3))
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
# closest one.
k_means_cluster_centers = np.sort(k_means.cluster_centers_, axis=0)
k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)

# KMeans
ax = fig.add_subplot(1, 3, 1)
for k, col in zip(range(n_clusters), colors):
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    ax.plot(X[my_members, 0], X[my_members, 1], 'w',
            markerfacecolor=col, marker='.')
    ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
            markeredgecolor='k', markersize=6)
ax.set_title('KMeans')
ax.set_xticks(())
ax.set_yticks(())
plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' % (
    t_batch, k_means.inertia_))


plt.show()
    K-Means clustering algorithms

(Figure: KMeans clustering result with the three cluster centers marked.)

The optimization of k-means

1. `k-means++` (changes the way the initial center points are chosen)
2. `elkan K-Means` (reduces unnecessary distance calculations)
3. `ISODATA` (adjusts the number of cluster centers K during the run according to the data)
4. `Mini Batch k-means` (uses a subset of the samples, trading a little accuracy for much faster convergence)

k-means++

Reference1: https://blog.csdn.net/github_39261590/article/details/76910689
Reference2: https://www.cnblogs.com/yszd/p/9672885.html
The basic idea behind selecting the initial seeds is that the initial cluster centers should be as far away from each other as possible.

Algorithm steps (a minimal seeding sketch follows the list):

  1. Select a point randomly from the set of input data points as the first cluster center.

  2. For each point x in the data set, calculate the distance D(x) between it and the nearest of the cluster centers already selected.

  3. Select a new data point as the next cluster center, with points that have a larger D(x) having a higher probability of being selected.

  4. Repeat steps 2 and 3 until K cluster centers have been selected.

  5. Run the standard k-means algorithm with these K initial cluster centers.
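A minimal sketch of the seeding procedure above (illustrative only; the function name and the D(x)^2-proportional sampling follow the standard k-means++ formulation and are not taken from the references):

import numpy as np

def kmeans_pp_init(X, k, rng=np.random.default_rng(0)):
    # step 1: pick the first center uniformly at random
    centers = [X[rng.integers(len(X))]]
    for _ in range(k - 1):
        # step 2: D(x)^2, squared distance of every point to its nearest chosen center
        d2 = np.min([np.sum((X - c) ** 2, axis=1) for c in centers], axis=0)
        # step 3: sample the next center with probability proportional to D(x)^2
        centers.append(X[rng.choice(len(X), p=d2 / d2.sum())])
    return np.array(centers)  # steps 4-5: hand these centers to standard k-means

With scikit-learn this seeding is what `init='k-means++'` does internally; the sketch could also be passed in explicitly, e.g. `KMeans(n_clusters=4, init=kmeans_pp_init(data, 4), n_init=1)`.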

import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as ds
import matplotlib.colors
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans

def expand(a, b):
    d = (b - a) * 0.1
    return a - d, b + d  # widen the interval [a, b] by 10% on each side

if __name__ == "__main__":
    N = 400
    centers = 4
    data, y = ds.make_blobs(N, n_features=2, centers=centers, random_state=2)
    data2, y2 = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=(1, 2.5, 0.5, 2), random_state=2)
    # stack subsets of each class row-wise to build an unbalanced data set
    data3 = np.vstack((data[y == 0][:], data[y == 1][:50], data[y == 2][:20], data[y == 3][:5]))
    y3 = np.array([0] * 100 + [1] * 50 + [2] * 20 + [3] * 5)
    cls = KMeans(n_clusters=4, init='k-means++')
    y_hat = cls.fit_predict(data)
    y2_hat = cls.fit_predict(data2)
    y3_hat = cls.fit_predict(data3)

    m = np.array(((1, 1), (1, 3)))
    data_r = data.dot(m)  # linearly transformed (sheared) version of the data
    y_r_hat = cls.fit_predict(data_r)

    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    cm = matplotlib.colors.ListedColormap(list('rgbm'))
    plt.figure(figsize=(9, 10), facecolor='w')
    plt.subplot(421)
    plt.title('Original data')
    plt.scatter(data[:, 0], data[:, 1], c=y, s=30, cmap=cm, edgecolors='none')
    x1_min, x2_min = np.min(data, axis=0)
    x1_max, x2_max = np.max(data, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True)

    plt.subplot(422)
    plt.title('KMeans++ clustering')
    plt.scatter(data[:, 0], data[:, 1], c=y_hat, s=30, cmap=cm, edgecolors='none')
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True)

(Figure: original data and the KMeans++ clustering result.)

elkan k-means

Reference: https://blog.csdn.net/u014465639/article/details/71342072

Elkan k-means uses the triangle inequality (the sum of two sides of a triangle is greater than or equal to the third side, and the difference of two sides is less than or equal to the third side) to avoid unnecessary distance calculations.

The first rule applies to a sample point x and two centroids j1 and j2. If we pre-compute the distance D(j1, j2) between the two centroids and find that 2 D(x, j1) <= D(j1, j2), then we immediately know that D(x, j1) <= D(x, j2), so the distance from x to j2 never has to be computed; one calculation is saved.

The second rule, for the same sample point x and centroids j1 and j2, gives a lower bound: D(x, j2) >= max(0, D(x, j1) - D(j1, j2)). This also follows directly from the triangle inequality.

Using these two rules, Elkan k-means iterates much faster than traditional k-means. However, if the sample features are sparse or contain missing values, some of the distances cannot be computed, so this algorithm cannot be applied.
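A minimal sketch of the first rule (illustrative only; the function and variable names are assumptions, not part of any library API):

import numpy as np

def can_skip_distance(d_x_c1, d_c1_c2):
    # Rule 1: if 2*D(x, c1) <= D(c1, c2) then D(x, c2) >= D(x, c1),
    # so x stays assigned to c1 and D(x, c2) never needs to be computed.
    return 2 * d_x_c1 <= d_c1_c2

c1, c2 = np.array([0.0, 0.0]), np.array([5.0, 0.0])
x = np.array([1.0, 0.5])
d_c1_c2 = np.linalg.norm(c1 - c2)  # pre-computed once per iteration for each centroid pair
print(can_skip_distance(np.linalg.norm(x - c1), d_c1_c2))  # True: x is certainly closer to c1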

ISODATA

Reference1: https://blog.csdn.net/houston11235/article/details/8511379
Reference2: https://www.cnblogs.com/huadongw/p/4101422.html
One disadvantage of k-means is that the number of clusters K has to be specified in advance, which sometimes works poorly. ISODATA removes this restriction by letting the number of clusters change during the run: by setting conditions under which clusters are split or merged, the number of clusters can automatically increase or decrease while clustering. The difficulty, of course, is that these split and merge conditions are not always easy to choose; even so, ISODATA still produces reliable results in many cases.
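As a rough illustration of the split and merge tests only (not a complete ISODATA implementation; all threshold values and names below are assumptions):

import numpy as np

def isodata_adjust(X, labels, centers, min_size=10, split_std=1.5, merge_dist=0.5):
    # Split: a cluster with many points and a large spread is divided in two along its widest axis.
    new_centers = []
    for k, c in enumerate(centers):
        pts = X[labels == k]
        if len(pts) >= 2 * min_size and pts.std(axis=0).max() > split_std:
            offset = np.zeros_like(c, dtype=float)
            offset[np.argmax(pts.std(axis=0))] = split_std
            new_centers += [c - offset, c + offset]
        else:
            new_centers.append(c)
    # Merge: a center lying within merge_dist of an already kept center is averaged into it.
    merged = []
    for c in new_centers:
        close = [i for i, d in enumerate(merged) if np.linalg.norm(d - c) < merge_dist]
        if close:
            merged[close[0]] = (merged[close[0]] + c) / 2
        else:
            merged.append(c)
    return np.array(merged)  # run k-means again with the adjusted centers and repeat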

Mini Batch k-means

Reference1: https://cloud.tencent.com/developer/article/1465020
The Mini Batch KMeans algorithm is a variant of KMeans that keeps the clustering accuracy as high as possible while greatly reducing the computation time. It optimizes the same objective function, but on small subsets of the data: at each training step a random mini-batch is drawn and used to update the centers. This greatly reduces the running time and the number of iterations needed for convergence, at the cost of results that are slightly worse than those of the standard algorithm. When clustering more than about ten thousand samples, it is worth considering Mini Batch KMeans.

'''
Comparison of the K-Means and MiniBatchKMeans clustering algorithms

'''

print(__doc__)

import time

import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin
from sklearn.datasets import make_blobs

# #############################################################################
# Generate sample data
np.random.seed(0)

batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]  # three initial centers
n_clusters = len(centers)  # number of clusters = 3
# generate 10000 two-dimensional samples around the three centers above, with standard deviation 0.7
X, labels_true = make_blobs(n_samples=10000, centers=centers, cluster_std=0.7)

# #############################################################################
# Compute clustering with Means

k_means = KMeans(init='k-means++', n_clusters=3, n_init=10)
t0 = time.time()
k_means.fit(X)
# time taken by k-means to fit the 10000 samples
t_batch = time.time() - t0

# #############################################################################
# Compute clustering with MiniBatchKMeans

mbk = MiniBatchKMeans(init='k-means++', n_clusters=3, batch_size=batch_size,
                      n_init=10, max_no_improvement=10, verbose=0)
t0 = time.time()
mbk.fit(X)
# time taken by MiniBatchKMeans to fit the 10000 samples
t_mini_batch = time.time() - t0

# #############################################################################
# Plot result

fig = plt.figure(figsize=(8, 3))
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
# closest one.
k_means_cluster_centers = np.sort(k_means.cluster_centers_, axis=0)
mbk_means_cluster_centers = np.sort(mbk.cluster_centers_, axis=0)
k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)
mbk_means_labels = pairwise_distances_argmin(X, mbk_means_cluster_centers)
order = pairwise_distances_argmin(k_means_cluster_centers,
                                  mbk_means_cluster_centers)

# KMeans
ax = fig.add_subplot(1, 3, 1)
for k, col in zip(range(n_clusters), colors):
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    ax.plot(X[my_members, 0], X[my_members, 1], 'w',
            markerfacecolor=col, marker='.')
    ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
            markeredgecolor='k', markersize=6)
ax.set_title('KMeans')
ax.set_xticks(())
ax.set_yticks(())
plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' % (
    t_batch, k_means.inertia_))

# MiniBatchKMeans
ax = fig.add_subplot(1, 3, 2)
for k, col in zip(range(n_clusters), colors):
    my_members = mbk_means_labels == order[k]
    cluster_center = mbk_means_cluster_centers[order[k]]
    ax.plot(X[my_members, 0], X[my_members, 1], 'w',
            markerfacecolor=col, marker='.')
    ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
            markeredgecolor='k', markersize=6)
ax.set_title('MiniBatchKMeans')
ax.set_xticks(())
ax.set_yticks(())
plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' %
         (t_mini_batch, mbk.inertia_))

# Initialise the different array to all False
different = (mbk_means_labels == 4)
ax = fig.add_subplot(1, 3, 3)

for k in range(n_clusters):
    different += ((k_means_labels == k) != (mbk_means_labels == order[k]))

identic = np.logical_not(different)
ax.plot(X[identic, 0], X[identic, 1], 'w',
        markerfacecolor='#bbbbbb', marker='.')
ax.plot(X[different, 0], X[different, 1], 'w',
        markerfacecolor='m', marker='.')
ax.set_title('Difference')
ax.set_xticks(())
ax.set_yticks(())

plt.show()

Comparison of the K-Means and MiniBatchKMeans clustering algorithms

(Figure: KMeans result, MiniBatchKMeans result, and the points assigned differently by the two algorithms.)