# Chapter 7 データマイニング: クラスター分析
def euclidD(point1, point2):  # points are equal-length lists or tuples
    """Return the Euclidean distance between two points.

    Args:
        point1: Sequence of numeric coordinates.
        point2: Sequence of numeric coordinates. Only the first
            ``len(point1)`` entries are read.

    Returns:
        The square root of the summed squared coordinate differences,
        as a float (0.0 for two empty points).

    Raises:
        IndexError: If ``point2`` has fewer entries than ``point1``.
    """
    # Imported locally: no module-level ``import math`` is visible in this
    # file, so relying on a global name would fail with NameError.
    import math

    # Indexing by point1's length (rather than zip) keeps the original
    # contract: a too-short point2 is an error, not a silent truncation.
    total = sum(
        (point1[index] - point2[index]) ** 2 for index in range(len(point1))
    )
    return math.sqrt(total)
def readFile(filename):
    """Load one integer per line of *filename* into a dictionary.

    Args:
        filename: Path of a text file holding one integer on each line.

    Returns:
        A dict whose keys are 1-based line numbers and whose values are
        one-element lists containing that line's integer.

    Raises:
        ValueError: If a line cannot be converted with ``int``.
    """
    with open(filename, "r") as dataFile:
        # enumerate supplies the 1-based line number that serves as the key.
        return {
            lineNumber: [int(text)]
            for lineNumber, text in enumerate(dataFile, start=1)
        }
# Session 7-1: read the exam-score file with readFile. The returned
# dictionary is not assigned to anything on this line.
readFile("cs150exams.txt")
# Add up the integers 1 through 10 with a counted loop, then show the sum.
total = 0
for aNum in range(1, 11):
    total += aNum
print(total)
# The same 1..10 sum written as a while loop: the counter is set up,
# tested, and advanced by hand.
total, aNum = 0, 1  # initialization
while aNum <= 10:  # condition
    total += aNum
    aNum += 1  # change of state
print(total)
# Sum 1..10 with a while loop. The counter has to advance on every pass:
# without the increment below the condition stays true forever, the loop
# never ends, and nothing after it in the file can run.
total = 0
aNum = 1
while aNum <= 10:
    total = total + aNum
    aNum = aNum + 1  # change of state (previously missing)
print(total)
def readFile(filename):
    """Load one integer per line of *filename*, reading line by line.

    Args:
        filename: Path of a text file holding one integer on each line.

    Returns:
        A dict whose keys are 1-based line numbers and whose values are
        one-element lists containing that line's integer.

    Raises:
        ValueError: If a line cannot be converted with ``int``.
    """
    dataDict = {}
    with open(filename, "r") as dataFile:
        lineNumber = 0
        # Two-argument iter() keeps calling readline() until it returns the
        # empty string, which is how readline() signals end of file.
        for text in iter(dataFile.readline, ""):
            lineNumber += 1
            dataDict[lineNumber] = [int(text)]
    return dataDict
def createCentroids(k, dataDict):
    """Pick k distinct data points at random to act as starting centroids.

    Args:
        k: Number of centroids to choose.
        dataDict: Mapping of key -> data point (a list of coordinates).
            Keys may be any hashable values; they need not be 1..n.

    Returns:
        A list of k data points taken from ``dataDict`` (the stored list
        objects themselves, not copies), each from a different key.
        An empty list when k is zero or negative.

    Raises:
        ValueError: If k is larger than the number of entries in
            ``dataDict``; k distinct points cannot be chosen in that case.
    """
    # Imported locally: no module-level ``import random`` is visible in
    # this file.
    import random

    if k <= 0:
        return []
    if k > len(dataDict):
        # The earlier draw-and-retry loop could never finish in this case.
        raise ValueError(
            "cannot choose %d distinct centroids from %d data points"
            % (k, len(dataDict))
        )

    # Sampling the real keys avoids assuming they are the integers 1..n.
    chosenKeys = random.sample(list(dataDict), k)
    return [dataDict[key] for key in chosenKeys]
def createClusters(k, centroids, dataDict, repeats):
    """Group the keys of ``dataDict`` into k clusters by nearest centroid.

    Each pass assigns every data point to the centroid with the smallest
    ``euclidD`` distance, then replaces each centroid with the average of
    the points assigned to it, and prints the resulting clusters.

    Args:
        k: Number of clusters.
        centroids: List of at least k starting centroids. It is updated
            in place as the passes run.
        dataDict: Mapping of key -> data point (list of coordinates).
        repeats: Number of assign/average passes to perform.

    Returns:
        A list of k lists of keys: the clusters produced by the final pass.
        With ``repeats`` equal to 0 this is k empty lists.
    """
    # Bound before the loop so a zero-pass call returns k empty clusters
    # instead of failing on an unassigned name.
    clusters = [[] for _ in range(k)]

    # Take the dimension from whichever point is stored first instead of
    # requiring a key equal to 1; an empty dataDict has no dimensions.
    dimensions = len(next(iter(dataDict.values()), []))

    for aPass in range(repeats):
        print("****PASS", aPass + 1, "****")
        clusters = [[] for _ in range(k)]

        # Assignment step: each key goes to its nearest centroid. On a tie
        # the lowest cluster index wins (index() returns the first match).
        for aKey, point in dataDict.items():
            distances = [
                euclidD(point, centroids[clusterIndex])
                for clusterIndex in range(k)
            ]
            nearest = distances.index(min(distances))
            clusters[nearest].append(aKey)

        # Update step: move each centroid to the mean of its members.
        for clusterIndex, members in enumerate(clusters):
            if not members:
                # Nothing to average. Keep the previous centroid rather
                # than collapsing it to the origin.
                continue
            sums = [0] * dimensions
            for aKey in members:
                for ind, value in enumerate(dataDict[aKey]):
                    sums[ind] = sums[ind] + value
            centroids[clusterIndex] = [s / len(members) for s in sums]

        for members in clusters:  # output the clusters
            print("CLUSTER")
            for aKey in members:
                print(dataDict[aKey], end=" ")
            print()

    return clusters
def clusterAnalysis(dataFile, k=5, repeats=3):
    """Read the scores in *dataFile* and cluster them, printing each pass.

    Args:
        dataFile: Path handed to ``readFile``.
        k: Number of clusters to form. Defaults to 5, the value that was
            previously fixed in the body.
        repeats: Number of clustering passes. Defaults to 3, likewise the
            previously fixed value.

    Returns:
        None. The clusters are reported through the printing done by
        ``createClusters``.
    """
    examDict = readFile(dataFile)
    examCentroids = createCentroids(k, examDict)
    createClusters(k, examCentroids, examDict, repeats)
# Run the clustering defined above on the exam-score file.
clusterAnalysis("cs150exams.txt")
import csv
# Peek at the earthquake CSV: print the header row and the first data row,
# then that row's columns 1 and 2 (labelled latitude and longitude here).
# newline="" is how the csv module asks for its input file to be opened, so
# that line endings inside quoted fields reach the reader untouched.
with open("earthquakes.csv", "r", newline="") as dataFile:
    csvReader = csv.reader(dataFile)  # get iterator
    titles = next(csvReader)  # read titles line
    print("titles:", titles)  # output titles
    earthquakeLine = next(csvReader)  # read first earthquake line
    print("earthquake:", earthquakeLine)  # output all data
    print("latitude:", earthquakeLine[1])  # output latitude
    print("longitude:", earthquakeLine[2])  # output longitude
import csv
def readEarthquakeFile(filename):
    """Read earthquake coordinates from a CSV file.

    The first row is treated as a header and skipped. Every following row
    adds one entry: the key is the row's 1-based position among the data
    rows, and the value is ``[longitude, latitude]``, read from columns 2
    and 1 of the row respectively.

    Args:
        filename: Path of the CSV file.

    Returns:
        A dict mapping row number -> ``[longitude, latitude]`` as floats.
        Empty when the file has no data rows (or is completely empty).

    Raises:
        ValueError: If a coordinate field cannot be converted with ``float``.
        IndexError: If a data row has fewer than three columns.
    """
    dataDict = {}
    # newline="" is what the csv module documents for files given to
    # csv.reader.
    with open(filename, "r", newline="") as dataFile:
        csvReader = csv.reader(dataFile)
        # Skip the titles row. The default keeps an empty file from raising
        # StopIteration.
        next(csvReader, None)
        for key, aLine in enumerate(csvReader, start=1):
            lat = float(aLine[1])  # extract latitude
            lon = float(aLine[2])  # extract longitude
            dataDict[key] = [lon, lat]
    return dataDict
import turtle
from readearthquakes import *
from dist import *
from createclusters import *
def visualizeQuakes(dataFile):
    """Cluster the earthquakes in *dataFile* and plot them on a world map.

    The data is read with ``readEarthquakeFile``, split into six clusters
    over seven passes, and each cluster is drawn as dots in its own color
    on top of the ``worldmap.gif`` background. The window stays open until
    it is clicked.

    Args:
        dataFile: Path handed to ``readEarthquakeFile``.
    """
    quakes = readEarthquakeFile(dataFile)
    startingCentroids = createCentroids(6, quakes)
    groups = createClusters(6, startingCentroids, quakes, 7)

    pen = turtle.Turtle()
    window = turtle.Screen()
    window.bgpic("worldmap.gif")
    window.screensize(448, 252)

    # Scale degrees to canvas units: half the canvas width spans 180 degrees
    # of longitude, half the canvas height spans 90 degrees of latitude.
    canvasSize = window.screensize()
    wFactor = (canvasSize[0] / 2) / 180
    hFactor = (canvasSize[1] / 2) / 90

    pen.hideturtle()
    pen.up()

    palette = ["red", "lawngreen", "blue", "orange", "cyan", "yellow"]
    for clusterIndex, shade in enumerate(palette):
        pen.color(shade)  # choose cluster color
        for aKey in groups[clusterIndex]:
            point = quakes[aKey]  # stored as [longitude, latitude]
            pen.goto(point[0] * wFactor, point[1] * hFactor)
            pen.dot()

    window.exitonclick()
# Cluster and draw the earthquakes listed in the CSV data file.
visualizeQuakes("earthquakes.csv")