[Study Notes][Python Machine Learning: Core Algorithms for Predictive Analysis][Multivariable Regression: Using Cross-Validation to Estimate the Out-of-Sample Error of a Lasso Model]

import numpy
from sklearn import datasets, linear_model
from sklearn.linear_model import LassoCV
from math import sqrt
import matplotlib.pyplot as plot

#read data into iterable (the urllib2 download from UCI is left commented out;
#a local copy of the file is read instead)
#target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
#data = urllib2.urlopen(target_url)
target_url_file = open('winequality-red.csv','r')
data = target_url_file.readlines()
target_url_file.close()

xList = []
labels = []
names = []
firstLine = True
for line in data:
    if firstLine:
        #first line holds the column names
        names = line.strip().split(";")
        firstLine = False
    else:
        #split on semi-colon
        row = line.strip().split(";")
        #put labels in separate array
        labels.append(float(row[-1]))
        #remove label from row
        row.pop()
        #convert row to floats
        floatRow = [float(num) for num in row]
        xList.append(floatRow)

#Normalize columns in x and labels
#Note: be careful about normalization.
#Some penalized regression packages include it and some don't.
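#Aside (not in the book's listing): sklearn.preprocessing.StandardScaler performs the
#same column-wise standardization, and it also divides by the population standard
#deviation, matching the sqrt(sumSq/nrows) computed below.  For example
#(xScaledCheck is an illustrative name only and is not used elsewhere):
#    from sklearn.preprocessing import StandardScaler
#    xScaledCheck = StandardScaler().fit_transform(numpy.array(xList))
#The manual loops below are kept to make the mean/std arithmetic explicit.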
nrows = len(xList)
ncols = len(xList[0])

#calculate means and standard deviations of the columns
xMeans = []
xSD = []
for i in range(ncols):
    col = [xList[j][i] for j in range(nrows)]
    mean = sum(col)/nrows
    xMeans.append(mean)
    colDiff = [(xList[j][i] - mean) for j in range(nrows)]
    sumSq = sum([colDiff[k] * colDiff[k] for k in range(nrows)])
    stdDev = sqrt(sumSq/nrows)
    xSD.append(stdDev)

#use the calculated means and standard deviations to normalize xList
xNormalized = []
for i in range(nrows):
    rowNormalized = [(xList[i][j] - xMeans[j])/xSD[j] for j in range(ncols)]
    xNormalized.append(rowNormalized)

#Normalize labels; note that with normalized labels the CV error reported below
#is in standardized units, not raw quality points
meanLabel = sum(labels)/nrows
sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrows)])/nrows)

labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrows)]

#Convert lists of lists to numpy arrays for input to sklearn packages

#Unnormalized labels
Y = numpy.array(labels)

#Normalized labels (this overwrites Y, so the normalized labels are what get used)
Y = numpy.array(labelNormalized)

#Unnormalized X's
X = numpy.array(xList)

#Normalized X's (this overwrites X, so the normalized attributes are what get used)
X = numpy.array(xNormalized)

#Call LassoCV from sklearn.linear_model
#10-fold cross-validation
wineModel = LassoCV(cv=10).fit(X, Y)
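#For reference, the fitted LassoCV object exposes the attributes used in the plots below:
#    wineModel.alphas_   - the grid of alpha values tried (in decreasing order)
#    wineModel.mse_path_ - mean squared error for each alpha on each CV fold
#                          (one row per alpha, one column per fold)
#    wineModel.alpha_    - the alpha value that minimizes the average CV error
#    wineModel.coef_     - coefficients of the model refit on the full data at alpha_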
# Display results

plot.figure(figsize=(12,8))
#MSE curve of each cross-validation fold as alpha varies
plot.plot(wineModel.alphas_, wineModel.mse_path_, ':')
#average MSE across the folds as alpha varies
plot.plot(wineModel.alphas_, wineModel.mse_path_.mean(axis=-1),
          label='Average MSE Across Folds', linewidth=2)
#vertical line at the best alpha, i.e. the alpha chosen by cross-validation
plot.axvline(wineModel.alpha_, linestyle='dotted', label='CV Estimate of Best alpha')
#use a semi-log scale on the x axis (log base 10 of alpha), which makes the
#exponentially spaced alpha values easier to read
plot.semilogx()
#add the legend
plot.legend()
#the current figure and axes can be obtained with plot.gcf() and plot.gca()
#(Get Current Figure and Get Current Axes)
ax = plot.gca()
#reverse the x axis so that alpha decreases from left to right
ax.invert_xaxis()

plot.xlabel('alpha')
plot.ylabel('Mean Square Error')
plot.axis('tight')
plot.show()

#print out the value of alpha that minimizes the CV error
print("alpha Value that Minimizes CV Error  ",wineModel.alpha_)
print("Minimum MSE  ", min(wineModel.mse_path_.mean(axis=-1)))

Output:

alpha Value that Minimizes CV Error   0.013561387700964642
Minimum MSE   0.6655849206002853
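
Because the labels were normalized before fitting, the minimum MSE reported above is in units of standardized quality scores rather than raw quality points. The short follow-up below is a sketch, not part of the book's listing; it assumes the variables from the code above (in particular wineModel and sdLabel) are still in scope, and converts the cross-validation error back to the original quality scale.

#convert the CV error from normalized-label units back to quality points
#since labelNormalized = (labels - meanLabel)/sdLabel, the MSE scales by sdLabel**2
bestMSE = min(wineModel.mse_path_.mean(axis=-1))
print("CV RMSE on the original quality scale  ", sqrt(bestMSE) * sdLabel)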