Chapter 7 データマイニング: クラスター分析

listing07-01.py
def euclidD(point1, point2):  # points are equal-length lists or tuples
    """Return the Euclidean distance between two points.

    Args:
        point1: Sequence of numeric coordinates.
        point2: Sequence of numeric coordinates, read position by position
            alongside ``point1``; it must be at least as long as ``point1``.

    Returns:
        The square root of the summed squared coordinate differences,
        as a float.

    Raises:
        IndexError: If ``point2`` has fewer coordinates than ``point1``.
    """
    # Imported here so the function does not rely on a module-level
    # ``import math`` being present wherever this listing is used.
    import math

    total = 0
    # Index into point2 (rather than zip) so that a too-short point2 fails
    # loudly instead of being silently truncated.
    for index, coordinate in enumerate(point1):
        total += (coordinate - point2[index]) ** 2

    return math.sqrt(total)
listing07-02.py
def readFile(filename):
    """Load a file of integer scores into a dictionary keyed by line number.

    Every line of the file is converted with ``int``. The entry for line
    ``n`` (counting from 1) is the one-element list ``[score]``.
    """
    with open(filename, "r") as scoreFile:
        # enumerate supplies the 1-based line number used as the key
        return {
            lineNumber: [int(text)]
            for lineNumber, text in enumerate(scoreFile, start=1)
        }
session07-01.py
# session 7-1
# Read the exam scores file with readFile. The returned dictionary is not
# stored here; at an interactive prompt it would simply be displayed.
readFile("cs150exams.txt")
listing07-03.py
# Add up the integers 1 through 10 with a counted for loop.
total = 0
for aNum in range(1, 11):  # range(1, 11) stops before 11
    total += aNum
print(total)
listing07-04.py
# Add up the integers 1 through 10 with a while loop.
total = 0
aNum = 1  # initialization
while aNum <= 10:  # condition
    total += aNum
    aNum += 1  # change of state
print(total)
listing07-05.py
# NOTE: aNum is never changed inside the loop body, so the condition
# aNum <= 10 stays true forever: the loop never terminates and
# print(total) is never reached.
# NOTE(review): presumably a deliberate infinite-loop illustration (there
# is no "change of state" step) -- confirm before altering the code.
total = 0
aNum = 1
while aNum <= 10:
    total = total + aNum
print(total)
listing07-06.py
def readFile(filename):
    """Read integer scores one ``readline`` call at a time.

    Returns a dictionary whose keys are 1-based line numbers and whose
    values are one-element lists holding the integer found on that line.
    """
    scores = {}
    with open(filename, "r") as dataFile:
        lineNumber = 0
        # readline() returns "" only at end of file; iter() stops on that
        # sentinel, which replaces the explicit while-loop test.
        for text in iter(dataFile.readline, ""):
            lineNumber += 1
            scores[lineNumber] = [int(text)]
    return scores
listing07-07.py
def createCentroids(k, dataDict):
    """Pick k distinct data points at random to serve as initial centroids.

    Args:
        k: Number of centroids to choose.
        dataDict: Mapping from key to data point.

    Returns:
        A list of k values from ``dataDict``, each taken from a different
        key. The list holds the same point objects stored in ``dataDict``
        (no copies). An empty list is returned when k is zero or negative.

    Raises:
        ValueError: If k is larger than the number of entries in
            ``dataDict`` (k distinct points cannot be chosen).
    """
    # Imported here so the function does not rely on a module-level
    # ``import random`` being present wherever this listing is used.
    import random

    if k > len(dataDict):
        raise ValueError(
            "cannot choose %d distinct centroids from %d data points"
            % (k, len(dataDict))
        )
    if k <= 0:
        return []

    # Sampling the actual keys guarantees uniqueness without a retry loop
    # and works even when the keys are not exactly 1..len(dataDict).
    chosenKeys = random.sample(list(dataDict), k)
    return [dataDict[aKey] for aKey in chosenKeys]
listing07-08.py
def createClusters(k, centroids, dataDict, repeats):
    """Group data points into k clusters by repeated nearest-centroid passes.

    On every pass each point is assigned to the centroid with the smallest
    ``euclidD`` distance, each centroid is then replaced by the average of
    the points assigned to it, and the clusters are printed.

    Args:
        k: Number of clusters.
        centroids: List of k starting centroids. It is updated in place:
            after each pass ``centroids[i]`` holds the average of cluster i.
        dataDict: Mapping from key to data point (a sequence of numbers).
        repeats: Number of assignment/update passes to run.

    Returns:
        A list of k lists; list i holds the keys of the points assigned to
        cluster i on the final pass. If ``repeats`` is 0 the k lists are
        empty.
    """
    # Result used when no pass runs at all (repeats == 0).
    clusters = [[] for _ in range(k)]

    # Take the dimensionality from the first stored point instead of
    # assuming a key named 1 exists; an empty dataDict gives 0 dimensions.
    dimensions = len(next(iter(dataDict.values()), ()))

    for aPass in range(repeats):
        print("****PASS", aPass + 1, "****")
        clusters = [[] for _ in range(k)]  # fresh list of k empty lists

        # Assign every point to its nearest centroid.
        for aKey, point in dataDict.items():
            distances = [
                euclidD(point, centroids[clusterIndex])
                for clusterIndex in range(k)
            ]
            # index() returns the first minimum, so ties go to the
            # lowest-numbered cluster.
            nearest = distances.index(min(distances))
            clusters[nearest].append(aKey)

        # Recompute each centroid as the mean of its cluster's points.
        for clusterIndex, members in enumerate(clusters):
            sums = [0] * dimensions  # init sum for each dimension
            for aKey in members:
                for ind, value in enumerate(dataDict[aKey]):
                    sums[ind] = sums[ind] + value
            if members:  # do not divide by 0
                sums = [aSum / len(members) for aSum in sums]
            # An empty cluster leaves sums at all zeros, so its centroid
            # becomes the origin.
            centroids[clusterIndex] = sums

        # Output the clusters once per pass.
        for members in clusters:
            print("CLUSTER")
            for aKey in members:
                print(dataDict[aKey], end=" ")
            print()

    # Return only after every pass has run.
    return clusters
listing07-09.py
def clusterAnalysis(dataFile, k=5, repeats=3):
    """Read a scores file and cluster its contents.

    Args:
        dataFile: Name of the file handed to ``readFile``.
        k: Number of clusters to form (default 5).
        repeats: Number of passes ``createClusters`` runs (default 3).

    Returns:
        The clusters produced by ``createClusters``.
    """
    examDict = readFile(dataFile)
    examCentroids = createCentroids(k, examDict)
    # Hand the result back to the caller instead of discarding it.
    return createClusters(k, examCentroids, examDict, repeats)
session07-02.py
# Run clusterAnalysis on the exam scores file.
clusterAnalysis("cs150exams.txt")


session07-03.py
import csv
# Peek at the CSV file: show the header row and the first data row.
# newline="" lets the csv module do its own line-ending handling, as the
# csv documentation asks for files passed to csv.reader.
with open("earthquakes.csv", "r", newline="") as dataFile:
    csvReader = csv.reader(dataFile)  # get iterator
    titles = next(csvReader)  # read titles line
    print("titles:", titles)  # output titles

    earthquakeLine = next(csvReader)  # read first earthquake line
    print("earthquake:", earthquakeLine)  # output all data
    print("latitude:", earthquakeLine[1])  # output latitude
    print("longitude:", earthquakeLine[2])  # output longitude
listing07-10.py
import csv


def readEarthquakeFile(filename):
    """Read earthquake coordinates from a CSV file.

    The first row is treated as a titles row and skipped. For every
    following non-blank row, column 1 is parsed as the latitude and
    column 2 as the longitude.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A dictionary mapping the 1-based record number to
        ``[longitude, latitude]`` (both floats). An empty dictionary is
        returned for an empty file or a file holding only the titles row.

    Raises:
        IndexError: If a non-blank row has fewer than three columns.
        ValueError: If a latitude or longitude field is not a number.
    """
    dataDict = {}
    # newline="" lets the csv module do its own line-ending handling, as
    # the csv documentation asks for files passed to csv.reader.
    with open(filename, "r", newline="") as dataFile:
        csvReader = csv.reader(dataFile)
        next(csvReader, None)  # skip titles; the default tolerates an empty file
        key = 0

        for aLine in csvReader:
            if not aLine:  # csv.reader yields [] for a blank line
                continue
            key += 1  # count stored records so keys stay contiguous from 1
            lat = float(aLine[1])  # extract latitude
            lon = float(aLine[2])  # extract longitude
            dataDict[key] = [lon, lat]

    return dataDict
listing07-11.py
import turtle
from readearthquakes import *
from dist import *
from createclusters import *


def visualizeQuakes(dataFile, k=6, repeats=7):
    """Cluster earthquake locations and plot each cluster in its own color.

    The points are read with ``readEarthquakeFile``, clustered with
    ``createCentroids`` / ``createClusters``, and drawn as dots over the
    "worldmap.gif" background. The window stays open until it is clicked.

    Args:
        dataFile: Name of the earthquake CSV file.
        k: Number of clusters to form (default 6).
        repeats: Number of clustering passes (default 7).
    """
    dataDict = readEarthquakeFile(dataFile)
    quakeCentroids = createCentroids(k, dataDict)
    clusters = createClusters(k, quakeCentroids, dataDict, repeats)

    quakeT = turtle.Turtle()
    quakeWin = turtle.Screen()
    quakeWin.bgpic("worldmap.gif")
    quakeWin.screensize(448, 252)

    # Scale so that 180 units of longitude span half the canvas width and
    # 90 units of latitude span half the canvas height.
    canvasWidth, canvasHeight = quakeWin.screensize()
    wFactor = (canvasWidth / 2) / 180
    hFactor = (canvasHeight / 2) / 90

    quakeT.hideturtle()
    quakeT.up()  # pen up: moving between points draws no lines

    colorList = ["red", "lawngreen", "blue", "orange", "cyan", "yellow"]

    for clusterIndex, members in enumerate(clusters):
        # Cycle through the palette so more clusters than colors still work.
        quakeT.color(colorList[clusterIndex % len(colorList)])
        for aKey in members:
            point = dataDict[aKey]  # stored as [longitude, latitude]
            quakeT.goto(point[0] * wFactor, point[1] * hFactor)
            quakeT.dot()
    quakeWin.exitonclick()


# Launch the visualization only when this file is run as a script, so that
# importing it does not open a turtle window as a side effect.
if __name__ == "__main__":
    visualizeQuakes("earthquakes.csv")