generalise features
This commit is contained in:
Generated
+50
-24
@@ -21,8 +21,8 @@
|
||||
<file leaf-file-name="KMeans.scala" pinned="false" current-in-tab="true">
|
||||
<entry file="file://$PROJECT_DIR$/src/main/scala/KMeans.scala">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="288">
|
||||
<caret line="40" column="0" lean-forward="true" selection-start-line="40" selection-start-column="0" selection-end-line="40" selection-end-column="0" />
|
||||
<state relative-caret-position="3168">
|
||||
<caret line="176" column="3" lean-forward="true" selection-start-line="176" selection-start-column="3" selection-end-line="176" selection-end-column="3" />
|
||||
<folding>
|
||||
<element signature="e#23#54#0" expanded="true" />
|
||||
</folding>
|
||||
@@ -30,6 +30,16 @@
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file leaf-file-name="DataFrame.scala" pinned="false" current-in-tab="false">
|
||||
<entry file="jar://$MAVEN_REPOSITORY$/org/apache/spark/spark-sql_2.10/1.6.0/spark-sql_2.10-1.6.0-sources.jar!/org/apache/spark/sql/DataFrame.scala">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="307">
|
||||
<caret line="1672" column="22" lean-forward="true" selection-start-line="1672" selection-start-column="22" selection-end-line="1672" selection-end-column="22" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file leaf-file-name="Main.scala" pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/src/main/scala/Main.scala">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
@@ -43,8 +53,8 @@
|
||||
<file leaf-file-name="XMLParser.scala" pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/src/main/scala/XMLParser.scala">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="144">
|
||||
<caret line="15" column="7" lean-forward="false" selection-start-line="15" selection-start-column="7" selection-end-line="15" selection-end-column="7" />
|
||||
<state relative-caret-position="394">
|
||||
<caret line="41" column="21" lean-forward="true" selection-start-line="41" selection-start-column="21" selection-end-line="41" selection-end-column="21" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
@@ -245,12 +255,12 @@
|
||||
<option name="number" value="Default" />
|
||||
<option name="presentableId" value="Default" />
|
||||
<updated>1481992380230</updated>
|
||||
<workItem from="1481992381575" duration="4864000" />
|
||||
<workItem from="1481992381575" duration="7661000" />
|
||||
</task>
|
||||
<servers />
|
||||
</component>
|
||||
<component name="TimeTrackingManager">
|
||||
<option name="totallyTimeSpent" value="4864000" />
|
||||
<option name="totallyTimeSpent" value="7661000" />
|
||||
</component>
|
||||
<component name="ToolWindowManager">
|
||||
<frame x="65" y="24" width="1615" height="1026" extended-state="6" />
|
||||
@@ -302,14 +312,6 @@
|
||||
<option name="FILTER_TARGETS" value="false" />
|
||||
</component>
|
||||
<component name="editorHistoryManager">
|
||||
<entry file="file://$PROJECT_DIR$/src/main/scala/Main.scala">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="414">
|
||||
<caret line="33" column="27" lean-forward="true" selection-start-line="33" selection-start-column="27" selection-end-line="33" selection-end-column="27" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="jar://$MAVEN_REPOSITORY$/org/scala-lang/scala-library/2.10.5/scala-library-2.10.5-sources.jar!/scala/Array.scala">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="8928">
|
||||
@@ -320,14 +322,6 @@
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/main/scala/XMLParser.scala">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="144">
|
||||
<caret line="15" column="7" lean-forward="false" selection-start-line="15" selection-start-column="7" selection-end-line="15" selection-end-column="7" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="jar://$MAVEN_REPOSITORY$/org/apache/spark/spark-core_2.10/1.6.0/spark-core_2.10-1.6.0-sources.jar!/org/apache/spark/rdd/RDD.scala">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="307">
|
||||
@@ -336,10 +330,42 @@
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/main/scala/Main.scala">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="414">
|
||||
<caret line="33" column="27" lean-forward="true" selection-start-line="33" selection-start-column="27" selection-end-line="33" selection-end-column="27" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="jar://$MAVEN_REPOSITORY$/org/apache/spark/spark-catalyst_2.10/1.6.0/spark-catalyst_2.10-1.6.0-sources.jar!/org/apache/spark/sql/Row.scala">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="1926">
|
||||
<caret line="127" column="17" lean-forward="true" selection-start-line="127" selection-start-column="17" selection-end-line="127" selection-end-column="17" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="jar://$MAVEN_REPOSITORY$/org/apache/spark/spark-sql_2.10/1.6.0/spark-sql_2.10-1.6.0-sources.jar!/org/apache/spark/sql/DataFrame.scala">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="307">
|
||||
<caret line="1672" column="22" lean-forward="true" selection-start-line="1672" selection-start-column="22" selection-end-line="1672" selection-end-column="22" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/main/scala/XMLParser.scala">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="394">
|
||||
<caret line="41" column="21" lean-forward="true" selection-start-line="41" selection-start-column="21" selection-end-line="41" selection-end-column="21" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/main/scala/KMeans.scala">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="288">
|
||||
<caret line="40" column="0" lean-forward="true" selection-start-line="40" selection-start-column="0" selection-end-line="40" selection-end-column="0" />
|
||||
<state relative-caret-position="3168">
|
||||
<caret line="176" column="3" lean-forward="true" selection-start-line="176" selection-start-column="3" selection-end-line="176" selection-end-column="3" />
|
||||
<folding>
|
||||
<element signature="e#23#54#0" expanded="true" />
|
||||
</folding>
|
||||
|
||||
+38
-17
@@ -13,24 +13,28 @@ object KMeans {
|
||||
|
||||
|
||||
def train(dataset : DataFrame, iterations:Int) : Unit = {
|
||||
val relevantData = dataset.select("Reputation")
|
||||
val K = 4
|
||||
val m = 2
|
||||
val relevantData = dataset.select("Reputation", "Views")
|
||||
val rows = relevantData.rdd
|
||||
//val rowsAsArray = rows.map(row => List(row.getInt(0).toFloat, row.getInt(1).toFloat, row.getInt(2).toFloat) )
|
||||
val rowsAsArray = rows.map(row => row.getInt(0).toFloat)
|
||||
val rowsAsArray = rows.map(row => convertRow(row, m))
|
||||
//val maximum = rowsAsArray.reduce((a, b) => math.max(a, b))
|
||||
//println(maximum)
|
||||
//rowsAsArray.foreach(println)
|
||||
val K = 4
|
||||
|
||||
//number of intended clusters
|
||||
//val n = rows.count() //number of datapoints
|
||||
val m = 1 //number of features
|
||||
//number of features
|
||||
//var centres = new ArrayBuffer[Float]()
|
||||
|
||||
var centres: Array[Float] = rowsAsArray.takeSample(false, K, System.nanoTime().toInt)
|
||||
//To reduce chance of two random centres being the same, add i to each
|
||||
var centres: Array[Array[Float]] = rowsAsArray.takeSample(false, K, System.nanoTime().toInt)
|
||||
//To reduce chance of two random centres being the same, add a changing value to each
|
||||
println("centres initialised")
|
||||
for (i <- 0 until K) {
|
||||
centres(i) += i
|
||||
for (j <- 0 until m) {
|
||||
centres(i)(j) += i+j
|
||||
}
|
||||
println("centre " + i + " = " + centres(i) )
|
||||
}
|
||||
|
||||
@@ -56,9 +60,9 @@ object KMeans {
|
||||
}
|
||||
|
||||
|
||||
def clustering(centres :Array[Float], rowsAsArray : RDD[Float], m : Int, K : Int) : Array[Float] = {
|
||||
val clusterMap :RDD[(Int,Float)]= rowsAsArray.map(row => (assignCluster(row,centres,m,K),row))
|
||||
val newCentres = clusterMap.reduceByKey((a,b) => getAverage(a,b))
|
||||
def clustering(centres :Array[Array[Float]], rowsAsArray : RDD[Array[Float]], m : Int, K : Int) : Array[Array[Float]] = {
|
||||
val clusterMap :RDD[(Int,Array[Float])]= rowsAsArray.map(row => (assignCluster(row,centres,m,K),row))
|
||||
val newCentres = clusterMap.reduceByKey((a,b) => getMeanVector(a,b,m))
|
||||
|
||||
val results = newCentres.map(x => x._2)
|
||||
results.collect()
|
||||
@@ -84,22 +88,22 @@ object KMeans {
|
||||
|
||||
|
||||
|
||||
/** Euclidean distance between a datapoint and a cluster centre.
  *
  * Only the first `m` components of each vector are compared.
  *
  * @param datapoint feature vector of the point; needs at least `m` entries
  * @param centre    feature vector of the centre; needs at least `m` entries
  * @param m         number of features to include in the distance
  * @return the square root of the summed squared per-feature differences
  */
def calculateNorm(datapoint : Array[Float], centre : Array[Float], m: Int): Double = {
  // Accumulate squared differences feature by feature, in index order.
  val sumOfSquares = (0 until m).foldLeft(0.0) { (acc, idx) =>
    acc + Math.pow(datapoint(idx) - centre(idx), 2.0)
  }
  Math.pow(sumOfSquares, 0.5)
}
|
||||
|
||||
def assignCluster(row : Float, centres: Array[Float], m : Int, K :Int): Int = {
|
||||
def assignCluster(row : Array[Float], centres: Array[Array[Float]], m : Int, K :Int): Int = {
|
||||
var smallestNorm = 999999.0
|
||||
var closestCentre = 0
|
||||
for (centreNumber <- 0 until K) {
|
||||
val norm = Math.abs(row - centres(centreNumber))
|
||||
//val norm = calculateNorm(row, centres(centreNumber), m)
|
||||
//val norm = Math.abs(row - centres(centreNumber))
|
||||
val norm = calculateNorm(row, centres(centreNumber), m)
|
||||
if (norm < smallestNorm) {
|
||||
smallestNorm = norm
|
||||
closestCentre = centreNumber
|
||||
@@ -142,6 +146,7 @@ object KMeans {
|
||||
newRow
|
||||
}*/
|
||||
|
||||
|
||||
def averageRow(a:List[Float], b:List[Float]) : List[Float] = {
|
||||
val means = new ArrayBuffer[Float]
|
||||
for (i <- 0 until a.size) {
|
||||
@@ -151,8 +156,24 @@ object KMeans {
|
||||
return means.toList
|
||||
}
|
||||
|
||||
def getAverage(a: Float, b:Float) : Float = {
|
||||
/*def getAverage(a: Float, b:Float) : Float = {
|
||||
return ((a+b)/2)
|
||||
}*/
|
||||
|
||||
/** Element-wise midpoint of two feature vectors.
  *
  * NOTE(review): the visible caller passes this to `reduceByKey`; averaging
  * pairwise there does not give the arithmetic mean of a cluster with more
  * than two points. Confirm whether a sum-and-count reduction is intended.
  *
  * @param a first feature vector; needs at least `m` entries
  * @param b second feature vector; needs at least `m` entries
  * @param m number of features to average
  * @return a new array of length `m` whose i-th entry is `(a(i) + b(i)) / 2`
  */
def getMeanVector(a: Array[Float], b: Array[Float], m: Int) : Array[Float] = {
  // Index with the running position i. The previous code indexed with m,
  // which is the array length and therefore always one past the last slot.
  Array.tabulate(m)(i => (a(i) + b(i)) / 2)
}
|
||||
|
||||
/** Copies the first `m` columns of a row into a float feature vector.
  *
  * Each column is read with `row.getInt`, so the selected columns are
  * expected to hold integers. NOTE(review): null or non-integer columns are
  * not handled here; confirm against the dataset schema.
  *
  * @param row source row; needs at least `m` columns
  * @param m   number of leading columns to convert
  * @return an array of length `m` with column i converted to `Float` at index i
  */
def convertRow(row : Row, m: Int) : Array[Float] = {
  // Read column i into slot i. The previous code used m for both indices,
  // which wrote one past the end of the array and never read columns 0..m-1.
  Array.tabulate(m)(i => row.getInt(i).toFloat)
}
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user