From 2eaeb5584c603403e951b99c94874c5b045414bc Mon Sep 17 00:00:00 2001 From: Joe Darby Date: Fri, 16 Dec 2016 13:04:57 +0000 Subject: [PATCH] one feature --- .classpath | 26 ++++++++++++++++++++++ .project | 23 +++++++++++++++++++ .settings/org.eclipse.core.resources.prefs | 2 ++ .settings/org.eclipse.jdt.core.prefs | 5 +++++ src/main/scala/KMeans.scala | 12 +++++----- src/main/scala/Main.scala | 8 +++---- src/main/scala/XMLParser.scala | 6 ++--- 7 files changed, 69 insertions(+), 13 deletions(-) create mode 100644 .classpath create mode 100644 .project create mode 100644 .settings/org.eclipse.core.resources.prefs create mode 100644 .settings/org.eclipse.jdt.core.prefs diff --git a/.classpath b/.classpath new file mode 100644 index 0000000..fd7ad7f --- /dev/null +++ b/.classpath @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.project b/.project new file mode 100644 index 0000000..fa02919 --- /dev/null +++ b/.project @@ -0,0 +1,23 @@ + + + Big_Data_Assignment_2 + + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.jdt.core.javanature + org.eclipse.m2e.core.maven2Nature + + diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs new file mode 100644 index 0000000..99f26c0 --- /dev/null +++ b/.settings/org.eclipse.core.resources.prefs @@ -0,0 +1,2 @@ +eclipse.preferences.version=1 +encoding/=UTF-8 diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs new file mode 100644 index 0000000..60105c1 --- /dev/null +++ b/.settings/org.eclipse.jdt.core.prefs @@ -0,0 +1,5 @@ +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 +org.eclipse.jdt.core.compiler.compliance=1.6 +org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning +org.eclipse.jdt.core.compiler.source=1.6 diff --git a/src/main/scala/KMeans.scala b/src/main/scala/KMeans.scala index e0ae53e..050d53f 100644 --- a/src/main/scala/KMeans.scala +++ b/src/main/scala/KMeans.scala @@ -12,12 +12,13 @@ object KMeans { //Create a map to store each data row with its closest cluster index as key def train(dataset : DataFrame) : RDD[(Int,List[Float])] = { - val relevantData = dataset.select("Reputation", "CreationDate", "LastAccessDate") + val relevantData = dataset.select("Reputation") val rows = relevantData.rdd - val rowsAsArray = rows.map(row => List(row.getInt(0).toFloat, row.getInt(1).toFloat, row.getInt(2).toFloat) ) + //val rowsAsArray = rows.map(row => List(row.getInt(0).toFloat, row.getInt(1).toFloat, row.getInt(2).toFloat) ) + val rowsAsArray = rows.map(row => List(row.getInt(0).toFloat) ) val K = 5 //number of intended clusters //val n = rows.count() //number of datapoints - val m = 3 //number of features + val m = 1 //number of features //var centres = new ArrayBuffer[Row] //get random number generator r and use to select K centres randomly from dataset @@ -28,7 +29,8 @@ object KMeans { centres(a) = rows(r.ne }*/ //val centres = rowsAsArray.takeSample(false, K, System.nanoTime().toInt) - val centres : Array[List[Float]] = Array(List(0.0f, 0.0f, 0.0f), List(10.0f, 10.0f, 10.0f), List(20.0f, 20.0f, 20.0f)) + //val centres : Array[List[Float]] = Array(List(0.0f, 0.0f, 0.0f), List(10.0f, 10.0f, 10.0f), List(20.0f, 20.0f, 20.0f)) + val centres : Array[List[Float]] = Array(List(0.0f), List(0.0f), List(0.0f), List(0.0f), List(0.0f)) val clusterMap :RDD[(Int,List[Float])]= rowsAsArray.map(row => (assignCluster(row,centres,m,K),row)) val newCentres = calculateNewCentres(clusterMap) newCentres @@ -45,7 +47,7 @@ object KMeans { } def assignCluster(row : List[Float], centres: Array[List[Float]], m : Int, K :Int): Int = { - var smallestNorm = 99999999999.0 + var smallestNorm = 999999.0 var closestCentre = 0 for (centreNumber <- 0 until K) { val norm = calculateNorm(row, centres(centreNumber), m) diff --git a/src/main/scala/Main.scala b/src/main/scala/Main.scala index a5a3b76..ec2ca0c 100644 --- a/src/main/scala/Main.scala +++ b/src/main/scala/Main.scala @@ -32,8 +32,10 @@ object Main { val centresArray = centres.collect() val unwrap = centresArray.map(x => x._2) unwrap.foreach(println) + } +} - /*val users = dataFrames("users") + //val users = dataFrames("users") /*val dataFrames = DataParser.ParseData() @@ -52,7 +54,3 @@ object Main { // Info on using DataFrames here: https://www.mapr.com/blog/using-apache-spark-dataframes-processing-tabular-data - - } -} - diff --git a/src/main/scala/XMLParser.scala b/src/main/scala/XMLParser.scala index dd5cf3c..94e121a 100644 --- a/src/main/scala/XMLParser.scala +++ b/src/main/scala/XMLParser.scala @@ -124,13 +124,13 @@ object XMLParser { // In this case, return a placeholder value of -1. case e: Exception => return -1 } - case DateType => - // If the string is a date, convert from date string to long. + case DateType => return attribute + /*// If the string is a date, convert from date string to long. var format = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS") var longTime = format.parse(attribute).getTime() // Then convert long to int representing days since epoch var longDays : Long = longTime / (1000*60*60*24) - return longDays.toInt + return longDays.toInt*/ } }