one feature
This commit is contained in:
+26
@@ -0,0 +1,26 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<classpath>
|
||||||
|
<classpathentry kind="src" output="target/classes" path="src/main/java">
|
||||||
|
<attributes>
|
||||||
|
<attribute name="optional" value="true"/>
|
||||||
|
<attribute name="maven.pomderived" value="true"/>
|
||||||
|
</attributes>
|
||||||
|
</classpathentry>
|
||||||
|
<classpathentry kind="src" output="target/test-classes" path="src/test/java">
|
||||||
|
<attributes>
|
||||||
|
<attribute name="optional" value="true"/>
|
||||||
|
<attribute name="maven.pomderived" value="true"/>
|
||||||
|
</attributes>
|
||||||
|
</classpathentry>
|
||||||
|
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6">
|
||||||
|
<attributes>
|
||||||
|
<attribute name="maven.pomderived" value="true"/>
|
||||||
|
</attributes>
|
||||||
|
</classpathentry>
|
||||||
|
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
|
||||||
|
<attributes>
|
||||||
|
<attribute name="maven.pomderived" value="true"/>
|
||||||
|
</attributes>
|
||||||
|
</classpathentry>
|
||||||
|
<classpathentry kind="output" path="target/classes"/>
|
||||||
|
</classpath>
|
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<projectDescription>
|
||||||
|
<name>Big_Data_Assignment_2</name>
|
||||||
|
<comment></comment>
|
||||||
|
<projects>
|
||||||
|
</projects>
|
||||||
|
<buildSpec>
|
||||||
|
<buildCommand>
|
||||||
|
<name>org.eclipse.jdt.core.javabuilder</name>
|
||||||
|
<arguments>
|
||||||
|
</arguments>
|
||||||
|
</buildCommand>
|
||||||
|
<buildCommand>
|
||||||
|
<name>org.eclipse.m2e.core.maven2Builder</name>
|
||||||
|
<arguments>
|
||||||
|
</arguments>
|
||||||
|
</buildCommand>
|
||||||
|
</buildSpec>
|
||||||
|
<natures>
|
||||||
|
<nature>org.eclipse.jdt.core.javanature</nature>
|
||||||
|
<nature>org.eclipse.m2e.core.maven2Nature</nature>
|
||||||
|
</natures>
|
||||||
|
</projectDescription>
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
eclipse.preferences.version=1
|
||||||
|
encoding/<project>=UTF-8
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
eclipse.preferences.version=1
|
||||||
|
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
|
||||||
|
org.eclipse.jdt.core.compiler.compliance=1.6
|
||||||
|
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
|
||||||
|
org.eclipse.jdt.core.compiler.source=1.6
|
||||||
@@ -12,12 +12,13 @@ object KMeans {
|
|||||||
//Create a map to store each data row with its closest cluster index as key
|
//Create a map to store each data row with its closest cluster index as key
|
||||||
|
|
||||||
def train(dataset : DataFrame) : RDD[(Int,List[Float])] = {
|
def train(dataset : DataFrame) : RDD[(Int,List[Float])] = {
|
||||||
val relevantData = dataset.select("Reputation", "CreationDate", "LastAccessDate")
|
val relevantData = dataset.select("Reputation")
|
||||||
val rows = relevantData.rdd
|
val rows = relevantData.rdd
|
||||||
val rowsAsArray = rows.map(row => List(row.getInt(0).toFloat, row.getInt(1).toFloat, row.getInt(2).toFloat) )
|
//val rowsAsArray = rows.map(row => List(row.getInt(0).toFloat, row.getInt(1).toFloat, row.getInt(2).toFloat) )
|
||||||
|
val rowsAsArray = rows.map(row => List(row.getInt(0).toFloat) )
|
||||||
val K = 5 //number of intended clusters
|
val K = 5 //number of intended clusters
|
||||||
//val n = rows.count() //number of datapoints
|
//val n = rows.count() //number of datapoints
|
||||||
val m = 3 //number of features
|
val m = 1 //number of features
|
||||||
//var centres = new ArrayBuffer[Row]
|
//var centres = new ArrayBuffer[Row]
|
||||||
|
|
||||||
//get random number generator r and use to select K centres randomly from dataset
|
//get random number generator r and use to select K centres randomly from dataset
|
||||||
@@ -28,7 +29,8 @@ object KMeans {
|
|||||||
centres(a) = rows(r.ne
|
centres(a) = rows(r.ne
|
||||||
}*/
|
}*/
|
||||||
//val centres = rowsAsArray.takeSample(false, K, System.nanoTime().toInt)
|
//val centres = rowsAsArray.takeSample(false, K, System.nanoTime().toInt)
|
||||||
val centres : Array[List[Float]] = Array(List(0.0f, 0.0f, 0.0f), List(10.0f, 10.0f, 10.0f), List(20.0f, 20.0f, 20.0f))
|
//val centres : Array[List[Float]] = Array(List(0.0f, 0.0f, 0.0f), List(10.0f, 10.0f, 10.0f), List(20.0f, 20.0f, 20.0f))
|
||||||
|
val centres : Array[List[Float]] = Array(List(0.0f), List(0.0f), List(0.0f), List(0.0f), List(0.0f))
|
||||||
val clusterMap :RDD[(Int,List[Float])]= rowsAsArray.map(row => (assignCluster(row,centres,m,K),row))
|
val clusterMap :RDD[(Int,List[Float])]= rowsAsArray.map(row => (assignCluster(row,centres,m,K),row))
|
||||||
val newCentres = calculateNewCentres(clusterMap)
|
val newCentres = calculateNewCentres(clusterMap)
|
||||||
newCentres
|
newCentres
|
||||||
@@ -45,7 +47,7 @@ object KMeans {
|
|||||||
}
|
}
|
||||||
|
|
||||||
def assignCluster(row : List[Float], centres: Array[List[Float]], m : Int, K :Int): Int = {
|
def assignCluster(row : List[Float], centres: Array[List[Float]], m : Int, K :Int): Int = {
|
||||||
var smallestNorm = 99999999999.0
|
var smallestNorm = 999999.0
|
||||||
var closestCentre = 0
|
var closestCentre = 0
|
||||||
for (centreNumber <- 0 until K) {
|
for (centreNumber <- 0 until K) {
|
||||||
val norm = calculateNorm(row, centres(centreNumber), m)
|
val norm = calculateNorm(row, centres(centreNumber), m)
|
||||||
|
|||||||
@@ -32,8 +32,10 @@ object Main {
|
|||||||
val centresArray = centres.collect()
|
val centresArray = centres.collect()
|
||||||
val unwrap = centresArray.map(x => x._2)
|
val unwrap = centresArray.map(x => x._2)
|
||||||
unwrap.foreach(println)
|
unwrap.foreach(println)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*val users = dataFrames("users")
|
//val users = dataFrames("users")
|
||||||
|
|
||||||
/*val dataFrames = DataParser.ParseData()
|
/*val dataFrames = DataParser.ParseData()
|
||||||
|
|
||||||
@@ -52,7 +54,3 @@ object Main {
|
|||||||
// Info on using DataFrames here: https://www.mapr.com/blog/using-apache-spark-dataframes-processing-tabular-data
|
// Info on using DataFrames here: https://www.mapr.com/blog/using-apache-spark-dataframes-processing-tabular-data
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|||||||
@@ -124,13 +124,13 @@ object XMLParser {
|
|||||||
// In this case, return a placeholder value of -1.
|
// In this case, return a placeholder value of -1.
|
||||||
case e: Exception => return -1
|
case e: Exception => return -1
|
||||||
}
|
}
|
||||||
case DateType =>
|
case DateType => return attribute
|
||||||
// If the string is a date, convert from date string to long.
|
/*// If the string is a date, convert from date string to long.
|
||||||
var format = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS")
|
var format = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS")
|
||||||
var longTime = format.parse(attribute).getTime()
|
var longTime = format.parse(attribute).getTime()
|
||||||
// Then convert long to int representing days since epoch
|
// Then convert long to int representing days since epoch
|
||||||
var longDays : Long = longTime / (1000*60*60*24)
|
var longDays : Long = longTime / (1000*60*60*24)
|
||||||
return longDays.toInt
|
return longDays.toInt*/
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user