ML-in-Action-Code-and-Note
ML-in-Action-Code-and-Note copied to clipboard
K-means bug fix
#@wheniseeyou K-means
from numpy import *
#step1 load DATASET
def loadDataSet(filename):
    '''
    Load a tab-separated numeric text file into a nested list.

    Every non-blank line is split on tab characters and each field is
    converted with ``float``.  The result is a plain list of lists; callers
    that need matrix-style indexing have to convert it themselves.

    :param filename: path of the text file to read.
    :return: list of rows, each row being a list of floats.
    :raises OSError: if the file cannot be opened.
    :raises ValueError: if a field cannot be parsed as a float.
    '''
    dataMat = []  # one list of floats per line of the file
    # The context manager closes the handle even if a row fails to parse.
    with open(filename) as fr:
        for line in fr:
            stripped = line.strip()
            # Skip blank lines (for example a trailing empty line), which
            # would otherwise make float('') raise ValueError.
            if not stripped:
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
'''
[[1.658985, 4.285136], [-3.453687, 3.424321], [4.838138, -1.151539], [-5.37971
'''
#======test====
#x=loadDataSet('/Users/admin/Desktop/testSet_K.txt')
#step2 calc distance
def distEclud(vecA, vecB):
    '''
    Return the Euclidean distance between two vectors.

    :param vecA: first vector (list, array or 1 x n matrix).
    :param vecB: second vector, same shape as ``vecA``.
    :return: square root of the sum of squared element differences.
    '''
    # Coerce both operands so plain Python lists can be subtracted too.
    diff = asarray(vecA) - asarray(vecB)
    # The array method sums every element regardless of dimensionality and
    # does not depend on whether ``sum`` is NumPy's or the builtin.
    return sqrt(power(diff, 2).sum())
#初始化K点
def randCent(dataSet, k):
    '''
    Create ``k`` random initial centroids inside the bounds of the data.

    For every feature (column) the coordinates are drawn uniformly between
    that column's minimum and maximum, so each centroid starts somewhere
    inside the bounding box of the data set.

    :param dataSet: 2-D data, one sample per row (nested list, array or
        matrix).
    :param k: number of centroids to generate.
    :return: k x n matrix of centroids, n being the number of columns.
    '''
    # Convert once up front instead of rebuilding the array for each column.
    data = asarray(dataSet, dtype=float)
    n = data.shape[1]
    colMin = data.min(axis=0)
    colRange = data.max(axis=0) - colMin
    # rand(n, k) is filled row by row, so after transposing, column j holds
    # the same k consecutive draws that a per-column rand(k, 1) loop would
    # produce; results under a fixed seed are therefore unchanged.
    draws = random.rand(n, k).T
    # asmatrix is used because the ``mat`` alias was removed in NumPy 2.0.
    return asmatrix(colMin + colRange * draws)
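'''
The k-means clustering function takes four parameters.
Two are required: the data set and the value of k.
The other two are the distance function and the centroid-initialisation
function (both replaceable).
The algorithm iterates "compute centroids -> assign points -> recompute
centroids" until no point's assignment changes any more.
The loop is controlled by the flag clusterChanged, initially set to True.
'''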
#K-均值算法:
def KMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    '''
    Cluster ``dataSet`` into ``k`` groups with the k-means algorithm.

    Points are assigned to their nearest centroid and every centroid is
    then moved to the mean of its points; this repeats until no assignment
    changes.  The centroids are printed once per iteration.

    :param dataSet: 2-D data, one sample per row.  It is converted to a
        matrix internally, so nested lists and arrays are accepted as well.
    :param k: number of clusters.
    :param distMeas: callable ``(centroidRow, sampleRow) -> distance``.
    :param createCent: callable ``(dataSet, k) -> k x n`` initial centroids.
    :return: tuple ``(centroids, clusterAssment)``; ``clusterAssment`` is an
        m x 2 matrix holding the cluster index in column 0 and the squared
        distance to that cluster's centroid in column 1.
    '''
    # Row/column indexing below needs a matrix; asmatrix replaces the
    # ``mat`` alias that was removed in NumPy 2.0.
    dataSet = asmatrix(dataSet)
    m = shape(dataSet)[0]
    clusterAssment = asmatrix(zeros((m, 2)))
    # -1 means "not assigned yet".  Starting from 0 would make a first pass
    # that puts every point in cluster 0 look like convergence, returning
    # distances measured against centroids that were moved afterwards.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assignment step: find the nearest centroid for every sample.
        for i in range(m):
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        # Progress trace: one print of the centroids per iteration.
        print(centroids)
        # Update step: move every centroid to the mean of its members.
        for cent in range(k):
            # Row indices of the samples currently assigned to this cluster.
            members = nonzero(clusterAssment[:, 0].A == cent)[0]
            ptsInClust = dataSet[members]
            # An empty cluster keeps its previous centroid; the mean of
            # zero rows would be NaN and poison every later distance.
            if ptsInClust.shape[0] > 0:
                centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment
# Demo run.  Guarded so that importing this module does not try to read the
# hard-coded sample file or print anything.
if __name__ == "__main__":
    dataMat = loadDataSet('/Users/admin/Desktop/testSet_K.txt')
    # KMeans indexes rows as dataSet[i, :], so hand it a matrix.  asmatrix
    # replaces the ``mat`` alias that was removed in NumPy 2.0; the matrix
    # is built once and reused for both calls.
    dataMatrix = asmatrix(dataMat)
    print(dataMatrix)
    print(KMeans(dataMatrix, 2))
In Python 3.6 the original code raises an error when run as-is. To fix it, the input must be converted to a matrix before it is passed to KMeans.
[[ 1.658985 4.285136]
[-3.453687 3.424321]
[ 4.838138 -1.151539]
[-5.379713 -3.362104]
[ 0.972564 2.924086]
[-3.567919 1.531611]
[ 0.450614 -3.302219]
[-3.487105 -1.724432]
[ 2.668759 1.594842]
[-3.156485 3.191137]
[ 3.165506 -3.999838]
[-2.786837 -3.099354]
[ 4.208187 2.984927]
[-2.123337 2.943366]
[ 0.704199 -0.479481]
[-0.39237 -3.963704]
[ 2.831667 1.574018]
[-0.790153 3.343144]
[ 2.943496 -3.357075]
[-3.195883 -2.283926]
[ 2.336445 2.875106]
[-1.786345 2.554248]
[ 2.190101 -1.90602 ]
[-3.403367 -2.778288]
[ 1.778124 3.880832]
[-1.688346 2.230267]
[ 2.592976 -2.054368]
[-4.007257 -3.207066]
[ 2.257734 3.387564]
[-2.679011 0.785119]
[ 0.939512 -4.023563]
[-3.674424 -2.261084]
[ 2.046259 2.735279]
[-3.18947 1.780269]
[ 4.372646 -0.822248]
[-2.579316 -3.497576]
[ 1.889034 5.1904 ]
[-0.798747 2.185588]
[ 2.83652 -2.658556]
[-3.837877 -3.253815]
[ 2.096701 3.886007]
[-2.709034 2.923887]
[ 3.367037 -3.184789]
[-2.121479 -4.232586]
[ 2.329546 3.179764]
[-3.284816 3.273099]
[ 3.091414 -3.815232]
[-3.762093 -2.432191]
[ 3.542056 2.778832]
[-1.736822 4.241041]
[ 2.127073 -2.98368 ]
[-4.323818 -3.938116]
[ 3.792121 5.135768]
[-4.786473 3.358547]
[ 2.624081 -3.260715]
[-4.009299 -2.978115]
[ 2.493525 1.96371 ]
[-2.513661 2.642162]
[ 1.864375 -3.176309]
[-3.171184 -3.572452]
[ 2.89422 2.489128]
[-2.562539 2.884438]
[ 3.491078 -3.947487]
[-2.565729 -2.012114]
[ 3.332948 3.983102]
[-1.616805 3.573188]
[ 2.280615 -2.559444]
[-2.651229 -3.103198]
[ 2.321395 3.154987]
[-1.685703 2.939697]
[ 3.031012 -3.620252]
[-4.599622 -2.185829]
[ 4.196223 1.126677]
[-2.133863 3.093686]
[ 4.668892 -2.562705]
[-2.793241 -2.149706]
[ 2.884105 3.043438]
[-2.967647 2.848696]
[ 4.479332 -1.764772]
[-4.905566 -2.91107 ]]
[[ 0.83986288 -1.83976116]
[-4.0469936 -1.40467133]]
[[ 2.47925453 0.21182184]
[-3.10532438 -0.12876332]]
[[ 2.71473038 0.18858278]
[-2.9219568 -0.07998038]]
(matrix([[ 2.71473038, 0.18858278],
[-2.9219568 , -0.07998038]]), matrix([[ 0. , 17.89634662],
[ 1. , 12.56286513],
[ 0. , 6.30478631],
[ 1. , 16.81290103],
[ 0. , 10.51812157],
[ 1. , 3.01449392],
[ 0. , 17.31191999],
[ 1. , 3.02361363],
[ 0. , 1.97967838],
[ 1. , 10.75521236],
[ 0. , 17.74606725],
[ 1. , 9.13487445],
[ 0. , 10.04995372],
[ 1. , 9.77841689],
[ 0. , 4.48854562],
[ 1. , 21.48211857],
[ 0. , 1.93310494],
[ 1. , 16.26236793],
[ 0. , 12.62402277],
[ 1. , 4.93241188],
[ 0. , 7.36050686],
[ 1. , 8.22877329],
[ 0. , 4.66259677],
[ 1. , 7.51261982],
[ 0. , 14.50993584],
[ 1. , 6.85903854],
[ 0. , 5.04565231],
[ 1. , 10.95654103],
[ 0. , 10.44232656],
[ 1. , 0.80741959],
[ 0. , 20.89357231],
[ 1. , 5.32341991],
[ 0. , 6.93251564],
[ 1. , 3.53209105],
[ 0. , 3.77046308],
[ 1. , 11.79736257],
[ 0. , 25.69995006],
[ 1. , 9.64081992],
[ 0. , 8.12103192],
[ 1. , 10.91213604],
[ 0. , 14.05290621],
[ 1. , 9.06855533],
[ 0. , 11.80514107],
[ 1. , 17.88489819],
[ 0. , 9.09553212],
[ 1. , 11.37480809],
[ 0. , 16.17242331],
[ 1. , 6.23872366],
[ 0. , 7.39385874],
[ 1. , 20.07577022],
[ 0. , 10.4085923 ],
[ 1. , 16.85042532],
[ 0. , 25.63541221],
[ 1. , 15.29989117],
[ 0. , 11.90587245],
[ 1. , 9.58149736],
[ 0. , 3.20000848],
[ 1. , 7.57676457],
[ 0. , 12.04560092],
[ 1. , 12.25947225],
[ 0. , 5.32472486],
[ 1. , 8.91695746],
[ 0. , 17.70978882],
[ 1. , 3.86003859],
[ 0. , 14.78056918],
[ 1. , 15.0490604 ],
[ 0. , 7.74010731],
[ 1. , 9.21313835],
[ 0. , 8.95426674],
[ 1. , 10.64677491],
[ 0. , 14.60725641],
[ 1. , 7.24915895],
[ 0. , 3.07484117],
[ 1. , 10.6932501 ],
[ 0. , 11.38833208],
[ 1. , 4.30033192],
[ 0. , 8.17888612],
[ 1. , 8.5792329 ],
[ 0. , 6.92941377],
[ 1. , 11.94977392]]))
Process finished with exit code 0