diff --git a/.classpath b/.classpath
new file mode 100644
index 0000000..fd7ad7f
--- /dev/null
+++ b/.classpath
@@ -0,0 +1,26 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/.project b/.project
new file mode 100644
index 0000000..fa02919
--- /dev/null
+++ b/.project
@@ -0,0 +1,23 @@
+
+
+ Big_Data_Assignment_2
+
+
+
+
+
+ org.eclipse.jdt.core.javabuilder
+
+
+
+
+ org.eclipse.m2e.core.maven2Builder
+
+
+
+
+
+ org.eclipse.jdt.core.javanature
+ org.eclipse.m2e.core.maven2Nature
+
+
diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs
new file mode 100644
index 0000000..99f26c0
--- /dev/null
+++ b/.settings/org.eclipse.core.resources.prefs
@@ -0,0 +1,2 @@
+eclipse.preferences.version=1
+encoding/=UTF-8
diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs
new file mode 100644
index 0000000..60105c1
--- /dev/null
+++ b/.settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,5 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
+org.eclipse.jdt.core.compiler.compliance=1.6
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
+org.eclipse.jdt.core.compiler.source=1.6
diff --git a/src/main/scala/KMeans.scala b/src/main/scala/KMeans.scala
index e0ae53e..050d53f 100644
--- a/src/main/scala/KMeans.scala
+++ b/src/main/scala/KMeans.scala
@@ -12,12 +12,13 @@ object KMeans {
//Create a map to store each data row with its closest cluster index as key
def train(dataset : DataFrame) : RDD[(Int,List[Float])] = {
- val relevantData = dataset.select("Reputation", "CreationDate", "LastAccessDate")
+ val relevantData = dataset.select("Reputation")
val rows = relevantData.rdd
- val rowsAsArray = rows.map(row => List(row.getInt(0).toFloat, row.getInt(1).toFloat, row.getInt(2).toFloat) )
+ //val rowsAsArray = rows.map(row => List(row.getInt(0).toFloat, row.getInt(1).toFloat, row.getInt(2).toFloat) )
+ val rowsAsArray = rows.map(row => List(row.getInt(0).toFloat) )
val K = 5 //number of intended clusters
//val n = rows.count() //number of datapoints
- val m = 3 //number of features
+ val m = 1 //number of features
//var centres = new ArrayBuffer[Row]
//get random number generator r and use to select K centres randomly from dataset
@@ -28,7 +29,8 @@ object KMeans {
centres(a) = rows(r.ne
}*/
//val centres = rowsAsArray.takeSample(false, K, System.nanoTime().toInt)
- val centres : Array[List[Float]] = Array(List(0.0f, 0.0f, 0.0f), List(10.0f, 10.0f, 10.0f), List(20.0f, 20.0f, 20.0f))
+ //val centres : Array[List[Float]] = Array(List(0.0f, 0.0f, 0.0f), List(10.0f, 10.0f, 10.0f), List(20.0f, 20.0f, 20.0f))
+ val centres : Array[List[Float]] = Array(List(0.0f), List(0.0f), List(0.0f), List(0.0f), List(0.0f))
val clusterMap :RDD[(Int,List[Float])]= rowsAsArray.map(row => (assignCluster(row,centres,m,K),row))
val newCentres = calculateNewCentres(clusterMap)
newCentres
@@ -45,7 +47,7 @@ object KMeans {
}
def assignCluster(row : List[Float], centres: Array[List[Float]], m : Int, K :Int): Int = {
- var smallestNorm = 99999999999.0
+ var smallestNorm = 999999.0
var closestCentre = 0
for (centreNumber <- 0 until K) {
val norm = calculateNorm(row, centres(centreNumber), m)
diff --git a/src/main/scala/Main.scala b/src/main/scala/Main.scala
index a5a3b76..ec2ca0c 100644
--- a/src/main/scala/Main.scala
+++ b/src/main/scala/Main.scala
@@ -32,8 +32,10 @@ object Main {
val centresArray = centres.collect()
val unwrap = centresArray.map(x => x._2)
unwrap.foreach(println)
+ }
+}
- /*val users = dataFrames("users")
+ //val users = dataFrames("users")
/*val dataFrames = DataParser.ParseData()
@@ -52,7 +54,3 @@ object Main {
// Info on using DataFrames here: https://www.mapr.com/blog/using-apache-spark-dataframes-processing-tabular-data
-
- }
-}
-
diff --git a/src/main/scala/XMLParser.scala b/src/main/scala/XMLParser.scala
index dd5cf3c..94e121a 100644
--- a/src/main/scala/XMLParser.scala
+++ b/src/main/scala/XMLParser.scala
@@ -124,13 +124,13 @@ object XMLParser {
// In this case, return a placeholder value of -1.
case e: Exception => return -1
}
- case DateType =>
- // If the string is a date, convert from date string to long.
+ case DateType => return attribute
+ /*// If the string is a date, convert from date string to long.
var format = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS")
var longTime = format.parse(attribute).getTime()
// Then convert long to int representing days since epoch
var longDays : Long = longTime / (1000*60*60*24)
- return longDays.toInt
+ return longDays.toInt*/
}
}