Python Machine Learning Series: Support Vector Machine (SVM) Implementation and Application

If anything in these notes is unclear, I strongly recommend consulting the books I reference here, as well as Machine Learning in Action and the "flower book" Deep Learning. Making good use of a search engine is also a powerful tool.

This article follows the same style as my earlier machine learning notes.

If something is unclear, you can go to the corresponding tutorial linked here; if the English is hard to follow, a translation plugin or tool helps a lot (reading with such help is actually much easier, especially for beginners who find English material a headache).

Implementing SVM

The SVM algorithm is a bit hard to understand at first, but after working through some articles and the books mentioned above, you will find it is really not that mysterious.

How the implementation works (an informal explanation, using the two-dimensional case as an example): the algorithm uses the support vectors to determine two parallel linear functions (the margin boundaries), and the decision boundary is taken midway between them; once we have it, the task is done. However, depending on the parameters there can be many, even infinitely many, candidate boundaries, so we need to find the optimal parameters. Because the problem is a convex quadratic program, a gradient-descent-style search can be used, and once the optimal parameters are found, the decision boundary function follows directly.
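
For reference, this is the textbook hard-margin formulation that the search in the code below approximates, using the usual notation of samples $x_i$ with labels $y_i \in \{-1, +1\}$:

$$
\min_{w,\,b}\ \frac{1}{2}\lVert w\rVert^{2}
\quad\text{s.t.}\quad y_i\,(w\cdot x_i + b)\ \ge\ 1,\qquad i = 1,\dots,m
$$

The decision boundary is $w\cdot x + b = 0$, and the two margin hyperplanes passing through the support vectors are $w\cdot x + b = \pm 1$.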

There is quite a bit of math involved... I think I have described it roughly right, haha...

To implement this algorithm, you really need to understand the relevant theory beforehand; otherwise you can hardly take a single step.

Below are some supplementary links to aid understanding; if you are still stuck, go back to the books and Andrew Ng's course:

Here is the complete implementation code:

import matplotlib.pyplot as plt
import numpy as np

class Support_Vector_Machine:
    def __init__(self, visualization=True):
        self.visualization = visualization
        self.colors = {1: 'r', -1: 'b'}
        if self.visualization:
            self.fig = plt.figure()
            self.ax = self.fig.add_subplot(1, 1, 1)

    def fit(self, data):
        self.data = data
        opt_dict = {}
        transforms = [[1, 1],
                      [-1, 1],
                      [-1, -1],
                      [1, -1]]
        all_data = []
        for yi in self.data:
            for featureset in self.data[yi]:
                for feature in featureset:
                    all_data.append(feature)
        self.max_feature_value = max(all_data)
        self.min_feature_value = min(all_data)
        all_data = None
        # step sizes for the coarse-to-fine descent over w
        step_sizes = [self.max_feature_value * 0.1,
                      self.max_feature_value * 0.01,
                      self.max_feature_value * 0.001]
        # search settings for b; w matters more than b, so b is searched more coarsely
        b_range_multiple = 2
        b_multiple = 5
        latest_optimum = self.max_feature_value * 10  # starting point for w (deliberately large)
        for step in step_sizes:
            w = np.array([latest_optimum, latest_optimum])
            optimized = False
            while not optimized:
                for b in np.arange(-1 * (self.max_feature_value * b_range_multiple),
                                   self.max_feature_value * b_range_multiple,
                                   step * b_multiple):
                    for transformation in transforms:
                        w_t = w * transformation
                        found_option = True
                        for i in self.data:
                            for xi in self.data[i]:
                                yi = i
                                if not yi * (np.dot(w_t, xi) + b) >= 1:
                                    found_option = False
                        # keep (w_t, b) only if every sample satisfies the constraint
                        if found_option:
                            opt_dict[np.linalg.norm(w_t)] = [w_t, b]
                # once w[0] goes negative, stop stepping at this step size
                if w[0] < 0:
                    optimized = True
                    print('Optimized a step.')
                else:
                    w = w - step
            norms = sorted([n for n in opt_dict])
            opt_choice = opt_dict[norms[0]]
            self.w = opt_choice[0]
            self.b = opt_choice[1]
            latest_optimum = opt_choice[0][0] + step * 2
        for i in self.data:
            for xi in self.data[i]:
                yi = i
                print(xi, ':', yi * (np.dot(self.w, xi) + self.b))

    # prediction
    def predict(self, features):
        classification = np.sign(np.dot(np.array(features), self.w) + self.b)
        if classification != 0 and self.visualization:
            self.ax.scatter(features[0], features[1], s=200, marker='*',
                            c=self.colors[classification])
        return classification

    # visualization
    def visualize(self):
        [[self.ax.scatter(x[0], x[1], s=100, color=self.colors[i])
          for x in self.data[i]] for i in self.data]

        def hyperplane(x, w, b, v):
            # solve w.x + b = v for the second coordinate
            return (-w[0] * x - b + v) / w[1]

        datarange = (self.min_feature_value * 0.9, self.max_feature_value * 1.1)
        hyp_x_min = datarange[0]
        hyp_x_max = datarange[1]
        # positive support vector hyperplane: w.x + b = 1
        psv1 = hyperplane(hyp_x_min, self.w, self.b, 1)
        psv2 = hyperplane(hyp_x_max, self.w, self.b, 1)
        self.ax.plot([hyp_x_min, hyp_x_max], [psv1, psv2], 'k')
        # negative support vector hyperplane: w.x + b = -1
        nsv1 = hyperplane(hyp_x_min, self.w, self.b, -1)
        nsv2 = hyperplane(hyp_x_max, self.w, self.b, -1)
        self.ax.plot([hyp_x_min, hyp_x_max], [nsv1, nsv2], 'k')
        # decision boundary: w.x + b = 0
        db1 = hyperplane(hyp_x_min, self.w, self.b, 0)
        db2 = hyperplane(hyp_x_max, self.w, self.b, 0)
        self.ax.plot([hyp_x_min, hyp_x_max], [db1, db2], 'y--')
        plt.show()

Supplementary links to aid understanding:

Prediction on a Small Dataset and Visualizing the Results

Below is the complete example. The Support_Vector_Machine class is the same one defined above, so only the imports and the driver code are shown again here:

import matplotlib.pyplot as plt
import numpy as np
plt.style.use('ggplot')

# Support_Vector_Machine: the same class as defined in the previous section

# training data
data_dict = {-1: np.array([[1, 7],
                           [2, 8],
                           [3, 8]]),
              1: np.array([[5, 1],
                           [6, -1],
                           [7, 3]])}
svm = Support_Vector_Machine()
svm.fit(data=data_dict)

# points to predict
predict_us = [[0, 10],
              [1, 3],
              [3, 4],
              [3, 5],
              [5, 5],
              [5, 6],
              [6, -5],
              [5, 8]]
for p in predict_us:
    svm.predict(p)

svm.visualize()

With that, the algorithm is implemented. The visualization looks like this:

In fact this is only a simple implementation of the algorithm, and many details are not handled. You also need a solid grasp of the underlying theory, otherwise the code above is hard to follow.

Going Further with SVM

This touches on kernel functions, hard-margin maximization, and soft-margin maximization. Roughly speaking, hard-margin maximization applies to linearly separable data, soft-margin maximization tolerates overlapping or noisy data, and kernel functions handle non-linear (linearly inseparable) data. These topics are worth studying and understanding on your own; the books mentioned above cover them.
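
For reference, the textbook soft-margin primal adds slack variables $\xi_i$ with a penalty $C$, and its dual (the form the code below actually solves) only touches the data through a kernel $K(x_i, x_j)$:

$$
\min_{w,\,b,\,\xi}\ \frac{1}{2}\lVert w\rVert^{2} + C\sum_{i=1}^{m}\xi_i
\quad\text{s.t.}\quad y_i\,(w\cdot x_i + b)\ \ge\ 1-\xi_i,\ \ \xi_i \ge 0
$$

$$
\max_{\alpha}\ \sum_{i=1}^{m}\alpha_i - \frac{1}{2}\sum_{i,j}\alpha_i\alpha_j\, y_i y_j\, K(x_i, x_j)
\quad\text{s.t.}\quad 0\le\alpha_i\le C,\ \ \sum_{i=1}^{m}\alpha_i y_i = 0
$$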

Below is a Python implementation that uses kernel functions and soft-margin maximization to handle non-linear data. You can think of it as part of SVM's low-level implementation, and it helps build a better understanding of how SVM works internally.
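
A quick note on how the dual above maps onto cvxopt (based on the documented cvxopt.solvers.qp interface): qp solves $\min_x \frac{1}{2}x^{T}Px + q^{T}x$ subject to $Gx \le h$ and $Ax = b$. The code below therefore sets $x=\alpha$, $P_{ij} = y_i y_j K(x_i, x_j)$, $q = -\mathbf{1}$, $A = y^{T}$, $b = 0$, and uses $G, h$ to encode $0 \le \alpha_i \le C$ (or just $\alpha_i \ge 0$ when C is None).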

# Mathieu Blondel, September 2010
# License: BSD 3 clause
# http://www.mblondel.org/journal/2010/09/19/support-vector-machines-in-python/
# visualizing what translating to another dimension does
# and bringing back to 2D:
# https://www.youtube.com/watch?v=3liCbRZPrZA
# Docs: http://cvxopt.org/userguide/coneprog.html#quadratic-programming
# Docs qp example: http://cvxopt.org/examples/tutorial/qp.html
# Nice tutorial:
# https://courses.csail.mit.edu/6.867/wiki/images/a/a7/Qp-cvxopt.pdf

import numpy as np
from numpy import linalg
import cvxopt
import cvxopt.solvers

def linear_kernel(x1, x2):
    return np.dot(x1, x2)

def polynomial_kernel(x, y, p=3):
    return (1 + np.dot(x, y)) ** p

def gaussian_kernel(x, y, sigma=5.0):
    return np.exp(-linalg.norm(x-y)**2 / (2 * (sigma ** 2)))

class SVM(object):

    def __init__(self, kernel=linear_kernel, C=None):
        self.kernel = kernel
        self.C = C
        if self.C is not None: self.C = float(self.C)

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # Gram matrix
        K = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(n_samples):
                K[i,j] = self.kernel(X[i], X[j])

        P = cvxopt.matrix(np.outer(y,y) * K)
        q = cvxopt.matrix(np.ones(n_samples) * -1)
        A = cvxopt.matrix(y, (1,n_samples))
        b = cvxopt.matrix(0.0)

        if self.C is None:
            G = cvxopt.matrix(np.diag(np.ones(n_samples) * -1))
            h = cvxopt.matrix(np.zeros(n_samples))
        else:
            tmp1 = np.diag(np.ones(n_samples) * -1)
            tmp2 = np.identity(n_samples)
            G = cvxopt.matrix(np.vstack((tmp1, tmp2)))
            tmp1 = np.zeros(n_samples)
            tmp2 = np.ones(n_samples) * self.C
            h = cvxopt.matrix(np.hstack((tmp1, tmp2)))

        # solve QP problem
        solution = cvxopt.solvers.qp(P, q, G, h, A, b)

        # Lagrange multipliers
        a = np.ravel(solution['x'])

        # Support vectors have non zero lagrange multipliers
        sv = a > 1e-5
        ind = np.arange(len(a))[sv]
        self.a = a[sv]
        self.sv = X[sv]
        self.sv_y = y[sv]
        print("%d support vectors out of %d points" % (len(self.a), n_samples))

        # Intercept
        self.b = 0
        for n in range(len(self.a)):
            self.b += self.sv_y[n]
            self.b -= np.sum(self.a * self.sv_y * K[ind[n],sv])
        self.b /= len(self.a)

        # Weight vector
        if self.kernel == linear_kernel:
            self.w = np.zeros(n_features)
            for n in range(len(self.a)):
                self.w += self.a[n] * self.sv_y[n] * self.sv[n]
        else:
            self.w = None

    def project(self, X):
        if self.w is not None:
            return np.dot(X, self.w) + self.b
        else:
            y_predict = np.zeros(len(X))
            for i in range(len(X)):
                s = 0
                for a, sv_y, sv in zip(self.a, self.sv_y, self.sv):
                    s += a * sv_y * self.kernel(X[i], sv)
                y_predict[i] = s
            return y_predict + self.b

    def predict(self, X):
        return np.sign(self.project(X))

if __name__ == "__main__":
    import pylab as pl

    def gen_lin_separable_data():
        # generate training data in the 2-d case
        mean1 = np.array([0, 2])
        mean2 = np.array([2, 0])
        cov = np.array([[0.8, 0.6], [0.6, 0.8]])
        X1 = np.random.multivariate_normal(mean1, cov, 100)
        y1 = np.ones(len(X1))
        X2 = np.random.multivariate_normal(mean2, cov, 100)
        y2 = np.ones(len(X2)) * -1
        return X1, y1, X2, y2

    def gen_non_lin_separable_data():
        mean1 = [-1, 2]
        mean2 = [1, -1]
        mean3 = [4, -4]
        mean4 = [-4, 4]
        cov = [[1.0,0.8], [0.8, 1.0]]
        X1 = np.random.multivariate_normal(mean1, cov, 50)
        X1 = np.vstack((X1, np.random.multivariate_normal(mean3, cov, 50)))
        y1 = np.ones(len(X1))
        X2 = np.random.multivariate_normal(mean2, cov, 50)
        X2 = np.vstack((X2, np.random.multivariate_normal(mean4, cov, 50)))
        y2 = np.ones(len(X2)) * -1
        return X1, y1, X2, y2

    def gen_lin_separable_overlap_data():
        # generate training data in the 2-d case
        mean1 = np.array([0, 2])
        mean2 = np.array([2, 0])
        cov = np.array([[1.5, 1.0], [1.0, 1.5]])
        X1 = np.random.multivariate_normal(mean1, cov, 100)
        y1 = np.ones(len(X1))
        X2 = np.random.multivariate_normal(mean2, cov, 100)
        y2 = np.ones(len(X2)) * -1
        return X1, y1, X2, y2

    def split_train(X1, y1, X2, y2):
        X1_train = X1[:90]
        y1_train = y1[:90]
        X2_train = X2[:90]
        y2_train = y2[:90]
        X_train = np.vstack((X1_train, X2_train))
        y_train = np.hstack((y1_train, y2_train))
        return X_train, y_train

    def split_test(X1, y1, X2, y2):
        X1_test = X1[90:]
        y1_test = y1[90:]
        X2_test = X2[90:]
        y2_test = y2[90:]
        X_test = np.vstack((X1_test, X2_test))
        y_test = np.hstack((y1_test, y2_test))
        return X_test, y_test

    def plot_margin(X1_train, X2_train, clf):
        def f(x, w, b, c=0):
            # given x, return y such that [x,y] is on the line
            # w.x + b = c
            return (-w[0] * x - b + c) / w[1]

        pl.plot(X1_train[:,0], X1_train[:,1], "ro")
        pl.plot(X2_train[:,0], X2_train[:,1], "bo")
        pl.scatter(clf.sv[:,0], clf.sv[:,1], s=100, c="g")

        # w.x + b = 0
        a0 = -4; a1 = f(a0, clf.w, clf.b)
        b0 = 4; b1 = f(b0, clf.w, clf.b)
        pl.plot([a0,b0], [a1,b1], "k")

        # w.x + b = 1
        a0 = -4; a1 = f(a0, clf.w, clf.b, 1)
        b0 = 4; b1 = f(b0, clf.w, clf.b, 1)
        pl.plot([a0,b0], [a1,b1], "k--")

        # w.x + b = -1
        a0 = -4; a1 = f(a0, clf.w, clf.b, -1)
        b0 = 4; b1 = f(b0, clf.w, clf.b, -1)
        pl.plot([a0,b0], [a1,b1], "k--")

        pl.axis("tight")
        pl.show()

    def plot_contour(X1_train, X2_train, clf):
        pl.plot(X1_train[:,0], X1_train[:,1], "ro")
        pl.plot(X2_train[:,0], X2_train[:,1], "bo")
        pl.scatter(clf.sv[:,0], clf.sv[:,1], s=100, c="g")

        X1, X2 = np.meshgrid(np.linspace(-6,6,50), np.linspace(-6,6,50))
        X = np.array([[x1, x2] for x1, x2 in zip(np.ravel(X1), np.ravel(X2))])
        Z = clf.project(X).reshape(X1.shape)
        pl.contour(X1, X2, Z, [0.0], colors='k', linewidths=1, origin='lower')
        pl.contour(X1, X2, Z + 1, [0.0], colors='grey', linewidths=1, origin='lower')
        pl.contour(X1, X2, Z - 1, [0.0], colors='grey', linewidths=1, origin='lower')

        pl.axis("tight")
        pl.show()

    def test_linear():
        X1, y1, X2, y2 = gen_lin_separable_data()
        X_train, y_train = split_train(X1, y1, X2, y2)
        X_test, y_test = split_test(X1, y1, X2, y2)

        clf = SVM()
        clf.fit(X_train, y_train)

        y_predict = clf.predict(X_test)
        correct = np.sum(y_predict == y_test)
        print("%d out of %d predictions correct" % (correct, len(y_predict)))

        plot_margin(X_train[y_train==1], X_train[y_train==-1], clf)

    def test_non_linear():
        X1, y1, X2, y2 = gen_non_lin_separable_data()
        X_train, y_train = split_train(X1, y1, X2, y2)
        X_test, y_test = split_test(X1, y1, X2, y2)

        clf = SVM(polynomial_kernel)
        clf.fit(X_train, y_train)

        y_predict = clf.predict(X_test)
        correct = np.sum(y_predict == y_test)
        print("%d out of %d predictions correct" % (correct, len(y_predict)))

        plot_contour(X_train[y_train==1], X_train[y_train==-1], clf)

    def test_soft():
        X1, y1, X2, y2 = gen_lin_separable_overlap_data()
        X_train, y_train = split_train(X1, y1, X2, y2)
        X_test, y_test = split_test(X1, y1, X2, y2)

        clf = SVM(C=1000.1)
        clf.fit(X_train, y_train)

        y_predict = clf.predict(X_test)
        correct = np.sum(y_predict == y_test)
        print("%d out of %d predictions correct" % (correct, len(y_predict)))

        plot_contour(X_train[y_train==1], X_train[y_train==-1], clf)

    #test_linear()
    #test_non_linear()
    test_soft()
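
As written, the script runs test_soft(), the soft-margin case with C = 1000.1 on overlapping data; to see the hard-margin linear case or the polynomial-kernel case instead, swap which of the three test calls at the bottom is uncommented.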

For a detailed explanation, see the corresponding course page.

More supplementary links:

Applying SVM

This again uses the real dataset from the previous article, Python Machine Learning Series: Implementing and Applying K-Nearest Neighbors (KNN). The only difference is that here we use the ready-made SVM from the sklearn module.

The code is as follows:

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split

df = pd.read_csv('breast-cancer-wisconsin.data')
df.replace('?', -99999, inplace=True)  # replace missing values ('?') with the outlier -99999; inplace=True modifies the DataFrame in place
df.drop(['Id'], axis=1, inplace=True)  # drop the irrelevant Id column

X = np.array(df.drop(['Class'], axis=1))  # features: all columns except the Class label
y = np.array(df['Class'])                 # labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = svm.SVC()  # SVM classifier
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)  # accuracy on the test set
print(accuracy)

# make up a couple of samples for a quick prediction
example_measures = np.array([[4,2,1,1,2,1,2,1,2], [4,2,1,1,2,2,2,2,1]])
# reshape to (n_samples, -1); the -1 lets numpy infer the number of columns by itself
example_measures = example_measures.reshape(len(example_measures), -1)
prediction = clf.predict(example_measures)
print(prediction)

The result is almost identical to the KNN result from the previous article, so I won't show it again.

For how to use the parameters of the ready-made estimator, see:
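
As a quick illustration (not from the original post), here is a minimal sketch of the SVC parameters that are most commonly tuned; the values shown are arbitrary examples, not recommendations:

from sklearn import svm

# a minimal sketch of commonly tuned SVC parameters (arbitrary example values)
clf = svm.SVC(kernel='rbf',   # kernel type: 'linear', 'poly', 'rbf', 'sigmoid'
              C=1.0,          # soft-margin penalty: larger C punishes violations harder
              gamma='scale')  # kernel coefficient for 'rbf'/'poly'/'sigmoid'
clf.fit(X_train, y_train)     # X_train/y_train as prepared in the script above
print(clf.score(X_test, y_test))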

That's basically it for this article; I'll come back and add more if it needs updating.

--------------- End of article ---------------

Author: 刘俊

Last updated: January 2, 2019 - 14:01

License: Please keep the original link and the author's name when reposting.