working feature parsing
This commit is contained in:
@@ -81,17 +81,12 @@
|
||||
<dependency>
|
||||
<groupId>org.scala-lang</groupId>
|
||||
<artifactId>scala-library</artifactId>
|
||||
<version>2.11.8</version>
|
||||
<version>2.10.4</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
<version>2.0.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
<version>2.0.2</version>
|
||||
<artifactId>spark-core_2.10</artifactId>
|
||||
<version>1.0.0-cdh5.1.0</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
package bdp.spark.KMeans
|
||||
|
||||
import scala.collection.mutable.ListBuffer
|
||||
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.SparkContext._
|
||||
import org.apache.spark.SparkConf
|
||||
@@ -7,28 +9,40 @@ import org.apache.spark.SparkConf
|
||||
object SparkKMeans {
|
||||
/**
 * Entry point of the job.
 *
 * Reads the text file named by the first command-line argument, turns each
 * line into its feature list with `getFeatures`, renders every list as a
 * comma-separated string with `makeListPrintable`, and prints the result.
 *
 * @param args command-line arguments; `args(0)` is the input path
 */
def main(args: Array[String]): Unit = {
  // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
  if (args.isEmpty) {
    System.err.println("Usage: SparkKMeans <input-path>")
    sys.exit(1)
  }

  val sc = new SparkContext(new SparkConf().setAppName("Spark KMeans"))
  try {
    val lines = sc.textFile(args(0))
    val featureSet = lines.map(getFeatures)
    val printableFeatureSet = featureSet.map(makeListPrintable)

    // NOTE(review): foreach on an RDD is evaluated where the partitions live,
    // so on a cluster this output presumably lands in the executors' stdout
    // rather than on the driver - confirm this is what is wanted.
    printableFeatureSet.foreach(println)
  } finally {
    // Release the context even when the job fails.
    sc.stop()
  }
}
|
||||
|
||||
/**
 * Extracts a fixed-size feature list from one line of input.
 *
 * The line is split on the double-quote character, so attribute names and
 * their values end up in alternating fragments, e.g.
 * `..., " Reputation=", "101", " CreationDate=", "2010-07-19T06:55:26", ...`.
 *
 * The result always has eight entries, in this order: the value of the first
 * quoted attribute on the line (fragment 1), then the values of Reputation,
 * CreationDate, LastAccessDate, Views, UpVotes, DownVotes and Age. The two
 * date values are cut to their first ten characters. Any value that is not
 * present on the line is returned as the empty string, so lines without
 * attributes yield eight empty strings instead of raising an exception.
 *
 * @param line one raw line of the input file
 * @return the eight extracted feature values, "" for every missing one
 */
def getFeatures(line: String): List[String] = {
  val fragments = line.split("\"")
  val featureIDs = List(" Reputation=", " CreationDate=", " LastAccessDate=", " Views=", " UpVotes=", " DownVotes=", " Age=")

  // Fragment at `index`, or "" when the line has no such fragment.
  def fragmentAt(index: Int): String =
    if (index >= 0 && index < fragments.length) fragments(index) else ""

  // The value following each attribute name; a single indexOf scan replaces
  // the former contains + indexOf pair.
  val values = featureIDs.map { id =>
    val index = fragments.indexOf(id)
    if (index >= 0) fragmentAt(index + 1) else ""
  }

  val features = fragmentAt(1) :: values

  // Positions 2 and 3 hold CreationDate and LastAccessDate. `take` keeps at
  // most ten characters and, unlike substring(0, 10), is safe on short or
  // empty values.
  features.zipWithIndex.map {
    case (value, position) if position == 2 || position == 3 => value.take(10)
    case (value, _) => value
  }
}
|
||||
|
||||
/**
 * Renders a feature list as one line of text.
 *
 * @param featureList the feature values to render
 * @return the values joined with ", "; the empty string for an empty list
 */
def makeListPrintable(featureList: List[String]): String =
  featureList.mkString(", ")
|
||||
}
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,5 +1,5 @@
|
||||
#Generated by Maven
|
||||
#Tue Dec 13 19:39:54 GMT 2016
|
||||
#Wed Dec 14 00:04:22 GMT 2016
|
||||
version=0.0.1
|
||||
groupId=uk.ac.qmul.spark
|
||||
artifactId=KMeans
|
||||
|
||||
Reference in New Issue
Block a user