1 import numpy
2 from sklearn import datasets, linear_model
3 from sklearn.linear_model import LassoCV
4 from math import sqrt
5 import matplotlib.pyplot as plot
6
# Read the raw lines of the wine-quality data set into a list.
# The file is presumably a local copy of the UCI data set published at
# http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
# The context manager closes the handle even if reading raises.
with open('winequality-red.csv', 'r') as target_url_file:
    data = target_url_file.readlines()
13
# Split the raw lines into column names, attribute rows and labels.
xList = []   # one list of float attribute values per data line
labels = []  # last field of every data line, converted to float
names = []   # fields of the first (header) line

lineIter = iter(data)
header = next(lineIter, None)
if header is not None:
    names = header.strip().split(";")

for rawLine in lineIter:
    # fields are separated by semicolons; the final field is the label
    *attributes, target = rawLine.strip().split(";")
    labels.append(float(target))
    # the remaining fields are the attributes, converted to floats
    xList.append([float(field) for field in attributes])
32
# Normalize columns in x and labels.
# Note: be careful about normalization.
# Some penalized regression packages include it and some don't.

nrows = len(xList)
ncols = len(xList[0])

# Calculate the mean and the population standard deviation (divisor is
# nrows) of every attribute column.
xMeans = []
xSD = []
for colIdx in range(ncols):
    # Extract the column once and reuse it for both statistics.
    col = [row[colIdx] for row in xList]
    mean = sum(col) / nrows
    xMeans.append(mean)
    colDiff = [value - mean for value in col]
    # The inner loop variable has its own name so it cannot be confused
    # with the column index of the enclosing loop.
    sumSq = sum(diff * diff for diff in colDiff)
    stdDev = sqrt(sumSq / nrows)
    xSD.append(stdDev)
51
# Use the calculated means and standard deviations to normalize xList:
# every value has its column mean subtracted and is divided by its
# column standard deviation.
xNormalized = [
    [(sample[c] - xMeans[c]) / xSD[c] for c in range(ncols)]
    for sample in xList
]
57
# Normalize the labels: subtract their mean and divide by their
# population standard deviation (divisor is nrows).
meanLabel = sum(labels) / nrows
squaredDiffs = [(y - meanLabel) * (y - meanLabel) for y in labels]
labelVariance = sum(squaredDiffs) / nrows
sdLabel = sqrt(labelVariance)

labelNormalized = [(y - meanLabel) / sdLabel for y in labels]
63
# Convert the lists to numpy arrays for input to the sklearn packages.
# The model is fit on the normalized data. To fit on the unnormalized
# values instead, build Y from `labels` and X from `xList`.

# Normalized labels
Y = numpy.array(labelNormalized)

# Normalized X's
X = numpy.array(xNormalized)
77
# Call LassoCV from sklearn.linear_model,
# using 10-fold cross-validation.
wineModel = LassoCV(cv=10)
wineModel.fit(X, Y)
81
# Display results: cross-validation mean squared error against alpha.

# Create exactly one figure so that plot.show() does not also open an
# empty window.
plot.figure(figsize=(12, 8))
# MSE curves from mse_path_ as alpha varies (dotted lines)
plot.plot(wineModel.alphas_, wineModel.mse_path_, ':')
# MSE averaged over the last axis of mse_path_ (the folds) for each alpha
plot.plot(wineModel.alphas_, wineModel.mse_path_.mean(axis=-1),
          label='Average MSE Across Folds', linewidth=2)
# Vertical line at the alpha selected by cross-validation
plot.axvline(wineModel.alpha_, linestyle='dotted',
             label='CV Estimate of Best alpha')
# Logarithmic x axis: makes behaviour across orders of magnitude of alpha
# easy to see
plot.semilogx()
# Add the legend for the labelled curves
plot.legend()
# plot.gca() returns the current axes (Get Current Axes)
ax = plot.gca()
# Reverse the x axis
ax.invert_xaxis()

plot.xlabel('alpha')
plot.ylabel('Mean Square Error')
plot.axis('tight')
plot.show()
107
# Print the value of alpha that minimizes the CV error, followed by the
# smallest fold-averaged MSE along the alpha path.
print("alpha Value that Minimizes CV Error ", wineModel.alpha_)
meanMseByAlpha = wineModel.mse_path_.mean(axis=-1)
print("Minimum MSE ", min(meanMseByAlpha))