add counts function

This commit is contained in:
Joe Darby
2016-12-19 13:39:43 +00:00
parent 83bd36b139
commit 498c6ba67a
3 changed files with 185 additions and 103 deletions
+105 -61
View File
@@ -2,7 +2,9 @@
<project version="4">
<component name="ChangeListManager">
<list default="true" id="a45ae16c-c18b-46fd-bdd5-74c3ba5fabef" name="Default" comment="">
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/src/main/scala/KMeans.scala" afterPath="$PROJECT_DIR$/src/main/scala/KMeans.scala" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/src/main/scala/XMLParser.scala" afterPath="$PROJECT_DIR$/src/main/scala/XMLParser.scala" />
</list>
<ignored path="$PROJECT_DIR$/target/" />
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
@@ -21,8 +23,8 @@
<file leaf-file-name="KMeans.scala" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/src/main/scala/KMeans.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1152">
<caret line="64" column="0" lean-forward="true" selection-start-line="64" selection-start-column="0" selection-end-line="64" selection-end-column="0" />
<state relative-caret-position="491">
<caret line="52" column="0" lean-forward="true" selection-start-line="52" selection-start-column="0" selection-end-line="52" selection-end-column="0" />
<folding>
<element signature="e#23#54#0" expanded="true" />
</folding>
@@ -30,23 +32,11 @@
</provider>
</entry>
</file>
<file leaf-file-name="ArrayOps.scala" pinned="false" current-in-tab="false">
<entry file="jar://$MAVEN_REPOSITORY$/org/scala-lang/scala-library/2.10.5/scala-library-2.10.5-sources.jar!/scala/collection/mutable/ArrayOps.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-2214">
<caret line="42" column="58" lean-forward="true" selection-start-line="42" selection-start-column="58" selection-end-line="42" selection-end-column="58" />
<folding>
<element signature="e#576#614#0" expanded="false" />
</folding>
</state>
</provider>
</entry>
</file>
<file leaf-file-name="Main.scala" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/src/main/scala/Main.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="144">
<caret line="16" column="51" lean-forward="false" selection-start-line="16" selection-start-column="51" selection-end-line="16" selection-end-column="51" />
<state relative-caret-position="360">
<caret line="27" column="34" lean-forward="false" selection-start-line="27" selection-start-column="34" selection-end-line="27" selection-end-column="34" />
<folding />
</state>
</provider>
@@ -55,8 +45,8 @@
<file leaf-file-name="XMLParser.scala" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/src/main/scala/XMLParser.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="612">
<caret line="41" column="21" lean-forward="false" selection-start-line="41" selection-start-column="21" selection-end-line="41" selection-end-column="21" />
<state relative-caret-position="-144">
<caret line="44" column="62" lean-forward="false" selection-start-line="44" selection-start-column="62" selection-end-line="44" selection-end-column="62" />
<folding />
</state>
</provider>
@@ -75,6 +65,7 @@
<component name="IdeDocumentHistory">
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/src/main/scala/XMLParser.scala" />
<option value="$PROJECT_DIR$/src/main/scala/KMeans.scala" />
</list>
</option>
@@ -86,10 +77,10 @@
<sorting>DEFINITION_ORDER</sorting>
</component>
<component name="ProjectFrameBounds">
<option name="x" value="49" />
<option name="y" value="63" />
<option name="x" value="75" />
<option name="y" value="52" />
<option name="width" value="1605" />
<option name="height" value="968" />
<option name="height" value="893" />
</component>
<component name="ProjectLevelVcsManager" settingsEditedManually="true" />
<component name="ProjectView">
@@ -107,6 +98,7 @@
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scratches" />
<pane id="ProjectPane">
<subPane>
<PATH>
@@ -143,9 +135,8 @@
</PATH>
</subPane>
</pane>
<pane id="PackagesPane" />
<pane id="Scratches" />
<pane id="Scope" />
<pane id="PackagesPane" />
</panes>
</component>
<component name="PropertiesComponent">
@@ -263,15 +254,17 @@
<updated>1481992380230</updated>
<workItem from="1481992381575" duration="8284000" />
<workItem from="1482003919097" duration="1224000" />
<workItem from="1482071638456" duration="1188000" />
<workItem from="1482071638456" duration="2869000" />
<workItem from="1482150691487" duration="1272000" />
<workItem from="1482152647060" duration="2108000" />
</task>
<servers />
</component>
<component name="TimeTrackingManager">
<option name="totallyTimeSpent" value="10696000" />
<option name="totallyTimeSpent" value="15757000" />
</component>
<component name="ToolWindowManager">
<frame x="49" y="63" width="1605" height="968" extended-state="0" />
<frame x="75" y="52" width="1605" height="893" extended-state="0" />
<editor active="true" />
<layout>
<window_info id="Palette" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
@@ -283,8 +276,8 @@
<window_info id="Capture Analysis" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.32948583" sideWeight="0.5" order="7" side_tool="true" content_ui="tabs" />
<window_info id="Maven Projects" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
<window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Properties" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
<window_info id="Capture Tool" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
@@ -295,8 +288,8 @@
<window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
<window_info id="UI Designer" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Theme Preview" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Favorites" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="true" content_ui="tabs" />
<window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" />
<window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
<window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
@@ -320,6 +313,58 @@
<option name="FILTER_TARGETS" value="false" />
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/src/main/scala/KMeans.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1818">
<caret line="104" column="13" lean-forward="false" selection-start-line="104" selection-start-column="13" selection-end-line="104" selection-end-column="13" />
<folding>
<element signature="e#23#54#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/Main.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="360">
<caret line="27" column="34" lean-forward="true" selection-start-line="27" selection-start-column="34" selection-end-line="27" selection-end-column="34" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/XMLParser.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="666">
<caret line="44" column="62" lean-forward="false" selection-start-line="44" selection-start-column="62" selection-end-line="44" selection-end-column="62" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/KMeans.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding>
<element signature="e#23#54#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/Main.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="468">
<caret line="26" column="0" lean-forward="true" selection-start-line="26" selection-start-column="0" selection-end-line="26" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/XMLParser.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="324">
<caret line="25" column="8" lean-forward="true" selection-start-line="25" selection-start-column="8" selection-end-line="25" selection-end-column="8" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/KMeans.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="936">
@@ -334,7 +379,6 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="16632">
<caret line="924" column="5" lean-forward="true" selection-start-line="924" selection-start-column="5" selection-end-line="924" selection-end-column="5" />
<folding />
</state>
</provider>
</entry>
@@ -342,7 +386,6 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="30096">
<caret line="1672" column="22" lean-forward="false" selection-start-line="1672" selection-start-column="22" selection-end-line="1672" selection-end-column="22" />
<folding />
</state>
</provider>
</entry>
@@ -376,7 +419,6 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="30096">
<caret line="1672" column="22" lean-forward="true" selection-start-line="1672" selection-start-column="22" selection-end-line="1672" selection-end-column="22" />
<folding />
</state>
</provider>
</entry>
@@ -410,35 +452,10 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/XMLParser.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="612">
<caret line="41" column="21" lean-forward="false" selection-start-line="41" selection-start-column="21" selection-end-line="41" selection-end-column="21" />
<folding />
</state>
</provider>
</entry>
<entry file="jar://$MAVEN_REPOSITORY$/org/apache/spark/spark-core_2.10/1.6.0/spark-core_2.10-1.6.0-sources.jar!/org/apache/spark/rdd/RDD.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="18">
<caret line="924" column="5" lean-forward="false" selection-start-line="924" selection-start-column="5" selection-end-line="924" selection-end-column="5" />
<folding />
</state>
</provider>
</entry>
<entry file="jar://$MAVEN_REPOSITORY$/org/apache/spark/spark-sql_2.10/1.6.0/spark-sql_2.10-1.6.0-sources.jar!/org/apache/spark/sql/DataFrame.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="18">
<caret line="1672" column="22" lean-forward="false" selection-start-line="1672" selection-start-column="22" selection-end-line="1672" selection-end-column="22" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/Main.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="144">
<caret line="16" column="51" lean-forward="false" selection-start-line="16" selection-start-column="51" selection-end-line="16" selection-end-column="51" />
<folding />
</state>
</provider>
</entry>
@@ -446,16 +463,43 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-2214">
<caret line="42" column="58" lean-forward="true" selection-start-line="42" selection-start-column="58" selection-end-line="42" selection-end-column="58" />
<folding>
<element signature="e#576#614#0" expanded="false" />
</folding>
</state>
</provider>
</entry>
<entry file="jar://$MAVEN_REPOSITORY$/org/apache/spark/spark-sql_2.10/1.6.0/spark-sql_2.10-1.6.0-sources.jar!/org/apache/spark/sql/DataFrame.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="287">
<caret line="328" column="6" lean-forward="false" selection-start-line="328" selection-start-column="6" selection-end-line="328" selection-end-column="6" />
</state>
</provider>
</entry>
<entry file="jar://$MAVEN_REPOSITORY$/org/apache/spark/spark-sql_2.10/1.6.0/spark-sql_2.10-1.6.0-sources.jar!/org/apache/spark/sql/SQLContext.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="253">
<caret line="486" column="6" lean-forward="false" selection-start-line="486" selection-start-column="6" selection-end-line="486" selection-end-column="6" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/Main.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="360">
<caret line="27" column="34" lean-forward="false" selection-start-line="27" selection-start-column="34" selection-end-line="27" selection-end-column="34" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/XMLParser.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-144">
<caret line="44" column="62" lean-forward="false" selection-start-line="44" selection-start-column="62" selection-end-line="44" selection-end-column="62" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/KMeans.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1152">
<caret line="64" column="0" lean-forward="true" selection-start-line="64" selection-start-column="0" selection-end-line="64" selection-end-column="0" />
<state relative-caret-position="491">
<caret line="52" column="0" lean-forward="true" selection-start-line="52" selection-start-column="0" selection-end-line="52" selection-end-column="0" />
<folding>
<element signature="e#23#54#0" expanded="true" />
</folding>
+77 -39
View File
@@ -12,9 +12,9 @@ object KMeans {
def train(dataset : DataFrame, iterations:Int) : Unit = {
val K = 10
val m = 4
val K = 10 // Number of desired clusters
val relevantData = dataset.select("Reputation", "Views", "UpVotes", "DownVotes")
val m = relevantData.columns.length //number of features
val rows = relevantData.rdd
val rowsAsArray = rows.map(row => convertRow(row, m)).persist()
@@ -30,23 +30,53 @@ object KMeans {
}
var counts = Array[Int](K)
for (i <- 0 until iterations) {
centres = clustering(centres, rowsAsArray, m, K)
if (centres == null) {
val clusterMap = clustering(centres, rowsAsArray, m, K).persist()
centres = getCentres(clusterMap, m, K)
counts = getCounts(clusterMap, K)
clusterMap.unpersist()
if (centres == null || counts == null) {
println("Error, starting again")
train(dataset, iterations)
return
}
println("\niteration " + i + " :")
for (j <- 0 until K) {
println("centre " + j + " = " + centres(j).mkString("[",",","]") )
println("centre " + j + " = " + centres(j).mkString("[",",","]") + " count = " + counts(j) )
}
}
}
def clustering(centres :Array[Array[Float]], rowsAsArray : RDD[Array[Float]], m : Int, K : Int) : Array[Array[Float]] = {
/*def clustering(centres :Array[Array[Float]], rowsAsArray : RDD[Array[Float]], m : Int, K : Int) : Array[Array[Float]] = {
val clusterMap :RDD[(Int,Array[Float])]= rowsAsArray.map(row => (assignCluster(row,centres,m,K),row)).persist()
val newCentres = clusterMap.reduceByKey((a,b) => getMeanVector(a,b,m))
val arrayNewCentres = newCentres.collect()
var results = new Array[Array[Float]](K)
for ((i,x) <- arrayNewCentres) {
results(i) = x
}
//Check all results are valid (no null)
for (i <- 0 until K) {
if (results(i) == null) {
return null
}
}
return results
}*/
/**
 * Pairs every data point with the cluster index that `assignCluster` picks for it.
 *
 * @param centres     current cluster centres, passed through to `assignCluster`
 * @param rowsAsArray data points, one feature array per element
 * @param m           number of features, passed through to `assignCluster`
 * @param K           number of clusters, passed through to `assignCluster`
 * @return an RDD of (cluster index, data point) pairs; this method does not persist it
 */
def clustering(centres :Array[Array[Float]], rowsAsArray : RDD[Array[Float]], m : Int, K : Int) : RDD[(Int,Array[Float])] = {
  rowsAsArray.map { point =>
    val clusterIndex = assignCluster(point, centres, m, K)
    (clusterIndex, point)
  }
}
def getCentres(clusterMap: RDD[(Int,Array[Float])], m: Int, K: Int) : Array[Array[Float]] = {
val newCentres = clusterMap.reduceByKey((a,b) => getMeanVector(a,b,m))
val arrayNewCentres = newCentres.collect()
@@ -63,6 +93,23 @@ object KMeans {
return results
}
/**
 * Counts how many data points were assigned to each cluster.
 *
 * @param clusterMap (cluster index, data point) pairs
 * @param K          number of clusters; a key outside `0 until K` makes the array
 *                   write below throw ArrayIndexOutOfBoundsException
 * @return an array of length K whose i-th entry is the number of pairs keyed by i.
 *         Clusters that received no points report a count of 0; this method never
 *         returns null.
 */
def getCounts(clusterMap: RDD[(Int,Array[Float])], K: Int) : Array[Int] = {
  val countsWithKeys = clusterMap.map(pair => (pair._1, 1)).reduceByKey(_ + _).collect()

  // Int arrays start zero-filled, so clusters missing from the reduced result keep 0.
  val counts = new Array[Int](K)
  for ((cluster, count) <- countsWithKeys) {
    counts(cluster) = count
  }

  // The former `counts(i) == null` scan was dropped: an Int never equals null,
  // so that check could never fire and its `return null` branch was dead code.
  counts
}
def calculateNorm(datapoint : Array[Float], centre : Array[Float], m: Int): Double = {
@@ -74,44 +121,35 @@ object KMeans {
norm
}
def assignCluster(row : Array[Float], centres: Array[Array[Float]], m : Int, K :Int): Int = {
var smallestNorm = 999999.0
var closestCentre = 0
for (centreNumber <- 0 until K) {
//val norm = Math.abs(row - centres(centreNumber))
val norm = calculateNorm(row, centres(centreNumber), m)
if (norm < smallestNorm) {
smallestNorm = norm
closestCentre = centreNumber
/**
 * Finds the index of the centre with the smallest `calculateNorm` value for a row.
 *
 * @param row     feature values of one data point
 * @param centres candidate cluster centres; only the first K entries are read
 * @param m       number of features, passed through to `calculateNorm`
 * @param K       number of centres to compare against
 * @return the index in `0 until K` with the smallest norm; the lowest index wins
 *         ties, and 0 is returned when no norm compares smaller than +infinity
 *         (e.g. K == 0 or every norm is NaN)
 */
def assignCluster(row : Array[Float], centres: Array[Array[Float]], m : Int, K :Int): Int = {
  // Start from +infinity instead of the old 999999.0 sentinel: with that sentinel,
  // a row whose norm to every centre was >= 999999 always fell through to cluster 0.
  var smallestNorm = Double.PositiveInfinity
  var closestCentre = 0
  for (centreNumber <- 0 until K) {
    val norm = calculateNorm(row, centres(centreNumber), m)
    // Strict comparison keeps the first centre encountered on ties.
    if (norm < smallestNorm) {
      smallestNorm = norm
      closestCentre = centreNumber
    }
  }
  closestCentre
}
closestCentre
}
def averageRow(a:List[Float], b:List[Float]) : List[Float] = {
val means = new ArrayBuffer[Float]
for (i <- 0 until a.size) {
val mean = (a(i) + b(i)) /2.0f
means(i) = mean
/**
 * Averages two vectors component by component.
 *
 * @param a first vector; must have at least m entries
 * @param b second vector; must have at least m entries
 * @param m number of leading components to average
 * @return a new array of length m where entry i is (a(i) + b(i)) / 2
 */
def getMeanVector(a: Array[Float], b: Array[Float], m: Int) : Array[Float] = {
  val averaged = new Array[Float](m)
  var idx = 0
  while (idx < m) {
    averaged(idx) = (a(idx) + b(idx)) / 2
    idx += 1
  }
  averaged
}
return means.toList
}
def getMeanVector(a: Array[Float], b: Array[Float], m: Int) : Array[Float] = {
var means = new Array[Float](m)
for (i <- 0 until m) {
means(i) = (a(i) + b(i)) / 2
/**
 * Copies the first m columns of a row into a Float array.
 *
 * Each column is read with `row.getInt` and widened to Float.
 * NOTE(review): presumably `getInt` fails on null or non-integer columns; confirm
 * against the Spark `Row` documentation.
 *
 * @param row source row
 * @param m   number of leading columns to read
 * @return a new array of length m holding the column values as Floats
 */
def convertRow(row : Row, m: Int) : Array[Float] = {
  val features = new Array[Float](m)
  features.indices.foreach { col =>
    features(col) = row.getInt(col).toFloat
  }
  features
}
means
}
def convertRow(row : Row, m: Int) : Array[Float] = {
var dataArray = new Array[Float](m)
for (i <- 0 until m) {
dataArray(i) = row.getInt(i).toFloat
}
dataArray
}
}
+3 -3
View File
@@ -124,13 +124,13 @@ object XMLParser {
// In this case, return a placeholder value of -1.
case e: Exception => return -1
}
case DateType => return attribute
/*// If the string is a date, convert from date string to long.
case DateType =>
// If the string is a date, convert from date string to long.
var format = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS")
var longTime = format.parse(attribute).getTime()
// Then convert long to int representing days since epoch
var longDays : Long = longTime / (1000*60*60*24)
return longDays.toInt*/
return longDays.toInt
}
}