Merge branch 'joe-dev2' of https://github.com/Pezz89/Big_Data_Assignment_2 into joe-dev2

2016-12-16 13:01:11 +00:00
parent 81a228a28f 2eaeb5584c
commit a9231d4329
8 changed files with 68 additions and 15 deletions
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" output="target/classes" path="src/main/java">
+		<attributes>
+			<attribute name="optional" value="true"/>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="src" output="target/test-classes" path="src/test/java">
+		<attributes>
+			<attribute name="optional" value="true"/>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="output" path="target/classes"/>
+</classpath>
@@ -2,7 +2,7 @@
 bin/
 target/
 .idea/
-stackoverflow_data/
+stackoverflow_dataset/
 # Compiled Object files
 *.slo
 *.lo
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>Big_Data_Assignment_2</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.m2e.core.maven2Builder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+		<nature>org.eclipse.m2e.core.maven2Nature</nature>
+	</natures>
+</projectDescription>
@@ -0,0 +1,2 @@
+eclipse.preferences.version=1
+encoding/<project>=UTF-8
@@ -0,0 +1,5 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
+org.eclipse.jdt.core.compiler.compliance=1.6
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
+org.eclipse.jdt.core.compiler.source=1.6
@@ -12,12 +12,13 @@ object KMeans {
   //Create a map to store each data row with its closest cluster index as key

  def train(dataset : DataFrame) : RDD[(Int,List[Float])] = {
-     val relevantData = dataset.select("Reputation", "CreationDate", "LastAccessDate")
+     val relevantData = dataset.select("Reputation")
    val rows = relevantData.rdd
-    val rowsAsArray = rows.map(row => List(row.getInt(0).toFloat, row.getInt(1).toFloat, row.getInt(2).toFloat) )
+    //val rowsAsArray = rows.map(row => List(row.getInt(0).toFloat, row.getInt(1).toFloat, row.getInt(2).toFloat) )
+    val rowsAsArray = rows.map(row => List(row.getInt(0).toFloat) )
    val K = 5 //number of intended clusters
    //val n = rows.count() //number of datapoints
-    val m = 3 //number of features
+    val m = 1 //number of features
    //var centres = new ArrayBuffer[Row]

    //get random number generator r and use to select K centres randomly from dataset
@@ -28,7 +29,8 @@ object KMeans {
      centres(a) = rows(r.ne
    }*/
    //val centres = rowsAsArray.takeSample(false, K, System.nanoTime().toInt)
-     val centres : Array[List[Float]] = Array(List(0.0f, 0.0f, 0.0f), List(10.0f, 10.0f, 10.0f), List(20.0f, 20.0f, 20.0f))
+     //val centres : Array[List[Float]] = Array(List(0.0f, 0.0f, 0.0f), List(10.0f, 10.0f, 10.0f), List(20.0f, 20.0f, 20.0f))
+     val centres : Array[List[Float]] = Array(List(0.0f), List(0.0f), List(0.0f), List(0.0f), List(0.0f))
     val clusterMap :RDD[(Int,List[Float])]= rowsAsArray.map(row => (assignCluster(row,centres,m,K),row))
     val newCentres = calculateNewCentres(clusterMap)
     newCentres
@@ -45,7 +47,7 @@ object KMeans {
  }

  def assignCluster(row : List[Float], centres: Array[List[Float]], m : Int, K :Int): Int = {
-    var smallestNorm = 99999999999.0
+    var smallestNorm = 999999.0
    var closestCentre = 0
    for (centreNumber <- 0 until K) {
      val norm = calculateNorm(row, centres(centreNumber), m)
@@ -36,7 +36,7 @@ object Main {
  }
 }

-    /*val users = dataFrames("users")
+    //val users = dataFrames("users")

    /*val dataFrames = DataParser.ParseData()

@@ -54,8 +54,3 @@ object Main {
 */
    // Info on using DataFrames here: https://www.mapr.com/blog/using-apache-spark-dataframes-processing-tabular-data

-
-
-  }
-}
-*/
@@ -124,13 +124,13 @@ object XMLParser {
          // In this case, return a placeholder value of -1.
          case e: Exception => return -1
        }
-      case DateType => 
-        // If the string is a date, convert from date string to long.
+      case DateType => return attribute
+        /*// If the string is a date, convert from date string to long.
        var format = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS")
        var longTime = format.parse(attribute).getTime()
        // Then convert long to int representing days since epoch
        var longDays : Long = longTime / (1000*60*60*24)
-        return longDays.toInt
+        return longDays.toInt*/
    }
  }