Merge branch 'joe-dev2' of https://github.com/Pezz89/Big_Data_Assignment_2 into joe-dev2

This commit is contained in:
Joe Darby
2016-12-15 23:07:44 +00:00
2 changed files with 57 additions and 47 deletions
+42 -33
View File
@@ -2,7 +2,7 @@
<project version="4">
<component name="ChangeListManager">
<list default="true" id="74fa95ce-dfd4-40da-a7a1-b336badfaea8" name="Default" comment="">
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/src/main/scala/XMLParser.scala" afterPath="$PROJECT_DIR$/src/main/scala/XMLParser.scala" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/src/main/scala/KMeans.scala" afterPath="$PROJECT_DIR$/src/main/scala/KMeans.scala" />
</list>
<ignored path="$PROJECT_DIR$/target/" />
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
@@ -18,11 +18,11 @@
<component name="ExecutionTargetManager" SELECTED_TARGET="default_target" />
<component name="FileEditorManager">
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file leaf-file-name="KMeans.scala" pinned="false" current-in-tab="false">
<file leaf-file-name="KMeans.scala" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/src/main/scala/KMeans.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="509">
<caret line="39" column="8" lean-forward="true" selection-start-line="39" selection-start-column="8" selection-end-line="39" selection-end-column="8" />
<state relative-caret-position="414">
<caret line="23" column="17" lean-forward="true" selection-start-line="23" selection-start-column="17" selection-end-line="23" selection-end-column="17" />
<folding>
<element signature="e#23#54#0" expanded="true" />
</folding>
@@ -35,12 +35,15 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="324">
<caret line="38" column="34" lean-forward="false" selection-start-line="38" selection-start-column="34" selection-end-line="38" selection-end-column="34" />
<folding />
<folding>
<element signature="n#!!doc" expanded="false" />
<element signature="e#832#872#0" expanded="false" />
</folding>
</state>
</provider>
</entry>
</file>
<file leaf-file-name="XMLParser.scala" pinned="false" current-in-tab="true">
<file leaf-file-name="XMLParser.scala" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/src/main/scala/XMLParser.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="441">
@@ -55,8 +58,8 @@
<file leaf-file-name="Main.scala" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/src/main/scala/Main.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="401">
<caret line="38" column="19" lean-forward="true" selection-start-line="38" selection-start-column="19" selection-end-line="38" selection-end-column="19" />
<state relative-caret-position="419">
<caret line="39" column="28" lean-forward="true" selection-start-line="39" selection-start-column="28" selection-end-line="39" selection-end-column="28" />
<folding>
<element signature="e#22#58#0" expanded="true" />
</folding>
@@ -78,8 +81,8 @@
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/src/main/scala/Main.scala" />
<option value="$PROJECT_DIR$/src/main/scala/KMeans.scala" />
<option value="$PROJECT_DIR$/src/main/scala/XMLParser.scala" />
<option value="$PROJECT_DIR$/src/main/scala/KMeans.scala" />
</list>
</option>
</component>
@@ -601,12 +604,12 @@
<updated>1481830590764</updated>
<workItem from="1481830593703" duration="700000" />
<workItem from="1481831304788" duration="5133000" />
<workItem from="1481837779668" duration="2057000" />
<workItem from="1481837779668" duration="4028000" />
</task>
<servers />
</component>
<component name="TimeTrackingManager">
<option name="totallyTimeSpent" value="7890000" />
<option name="totallyTimeSpent" value="9861000" />
</component>
<component name="ToolWindowManager">
<frame x="77" y="122" width="1400" height="893" extended-state="0" />
@@ -669,7 +672,10 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="324">
<caret line="38" column="34" lean-forward="false" selection-start-line="38" selection-start-column="34" selection-end-line="38" selection-end-column="34" />
<folding />
<folding>
<element signature="n#!!doc" expanded="false" />
<element signature="e#832#872#0" expanded="false" />
</folding>
</state>
</provider>
</entry>
@@ -759,7 +765,10 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="324">
<caret line="38" column="34" lean-forward="false" selection-start-line="38" selection-start-column="34" selection-end-line="38" selection-end-column="34" />
<folding />
<folding>
<element signature="n#!!doc" expanded="false" />
<element signature="e#832#872#0" expanded="false" />
</folding>
</state>
</provider>
</entry>
@@ -785,26 +794,6 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/Main.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="401">
<caret line="38" column="19" lean-forward="true" selection-start-line="38" selection-start-column="19" selection-end-line="38" selection-end-column="19" />
<folding>
<element signature="e#22#58#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/KMeans.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="509">
<caret line="39" column="8" lean-forward="true" selection-start-line="39" selection-start-column="8" selection-end-line="39" selection-end-column="8" />
<folding>
<element signature="e#23#54#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="jar:///usr/lib/jvm/java-8-openjdk-amd64/jre/lib/rt.jar!/java/util/Date.class">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="262">
@@ -816,6 +805,16 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/Main.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="419">
<caret line="39" column="28" lean-forward="true" selection-start-line="39" selection-start-column="28" selection-end-line="39" selection-end-column="28" />
<folding>
<element signature="e#22#58#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/XMLParser.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="441">
@@ -826,5 +825,15 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/KMeans.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="414">
<caret line="23" column="17" lean-forward="true" selection-start-line="23" selection-start-column="17" selection-end-line="23" selection-end-column="17" />
<folding>
<element signature="e#23#54#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</component>
</project>
+15 -14
View File
@@ -2,6 +2,7 @@ package ClusterSOData
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import scala.collection.mutable.ArrayBuffer
object KMeans {
@@ -10,10 +11,11 @@ object KMeans {
*/
//Create a map to store each data row with its closest cluster index as key
def train(dataset : DataFrame) : RDD[(Int,Row)] = {
def train(dataset : DataFrame) : RDD[(Int,List[Float])] = {
val rows = dataset.rdd
val rowsAsArray = dataset.map(row => List(row.getInt(0).toFloat, row.getInt(1).toFloat, row.getInt(2).toFloat) )
val K = 5 //number of intended clusters
val n = rows.count() //number of datapoints
//val n = rows.count() //number of datapoints
val m = 3 //number of features
//var centres = new ArrayBuffer[Row]
@@ -24,23 +26,23 @@ object KMeans {
for (a <- 0 until K) {
centres(a) = rows(r.ne
}*/
val centres = rows.takeSample(false, K, System.nanoTime().toInt)
val clusterMap :RDD[(Int,Row)]= rows.map(row => (assignCluster(row,centres,m,K),row))
val centres = rowsAsArray.takeSample(false, K, System.nanoTime().toInt)
val clusterMap :RDD[(Int,List[Float])]= rowsAsArray.map(row => (assignCluster(row,centres,m,K),row))
val newCentres = calculateNewCentres(clusterMap)
newCentres
}
/**
 * Euclidean (L2) distance between a data point and a cluster centre.
 *
 * Only the first `m` features of each vector take part in the distance. If
 * either vector holds fewer than `m` values, the comparison covers the
 * features both vectors actually have.
 *
 * @param datapoint feature values of the point being measured
 * @param centre    feature values of the candidate cluster centre
 * @param m         number of leading features to compare
 * @return the square root of the summed squared per-feature differences
 */
def calculateNorm(datapoint : List[Float], centre : List[Float], m: Int): Double = {
  // Walk both lists once in lock-step; indexing a List by position is O(n)
  // per access, and a fold removes the need for a mutable accumulator.
  val sumOfSquares = datapoint.iterator.zip(centre.iterator).take(m).foldLeft(0.0) {
    case (acc, (p, c)) =>
      val diff = (p - c).toDouble
      acc + diff * diff
  }
  math.sqrt(sumOfSquares)
}
def assignCluster(row : Row, centres: Array[Row], m : Int, K :Int): Int = {
def assignCluster(row : List[Float], centres: Array[List[Float]], m : Int, K :Int): Int = {
var smallestNorm = 99999999999.0
var closestCentre = 0
for (centreNumber <- 0 until K) {
@@ -53,7 +55,7 @@ object KMeans {
closestCentre
}
def calculateNewCentres(clusterMap : RDD[(Int,Row)]): RDD[(Int,Row)] = {
def calculateNewCentres(clusterMap : RDD[(Int,List[Float])]): RDD[(Int,List[Float])] = {
//val data = clusterMap.map(x => (x._1, x._2.asInstanceOf[ArrayBuffer[Double]]))
val newCentres = clusterMap.reduceByKey((a, b) => averageRow(a, b))
//val singleCluster = clusterMap.filter(x => x._1 == 0)
@@ -85,14 +87,13 @@ object KMeans {
newRow
}*/
/**
 * Element-wise mean of two feature vectors.
 *
 * The previous implementation assigned into an empty `ArrayBuffer` by index
 * (`means(i) = mean`), which throws `IndexOutOfBoundsException` for any
 * non-empty input; building the result with `zip`/`map` avoids that.
 *
 * NOTE(review): `calculateNewCentres` passes this function to `reduceByKey`.
 * Averaging pairwise is not associative, so for clusters with more than two
 * points the result is presumably not the true centroid — confirm whether a
 * sum-and-count reduction is intended instead.
 *
 * @param a first feature vector
 * @param b second feature vector
 * @return a list whose i-th element is `(a(i) + b(i)) / 2`, covering the
 *         positions present in both inputs
 */
def averageRow(a:List[Float], b:List[Float]) : List[Float] = {
  a.zip(b).map { case (x, y) => (x + y) / 2.0f }
}
}