Hack not working yet...

This commit is contained in:
Sam Perry
2016-12-19 11:21:16 +00:00
parent 100aab773a
commit 86b512234a
3 changed files with 18 additions and 6 deletions
+5
View File
@@ -93,5 +93,10 @@
<artifactId>spark-sql_2.11</artifactId>
<version>2.0.2</version>
</dependency>
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-xml_2.11</artifactId>
<version>0.4.0</version>
</dependency>
</dependencies>
</project>
+1 -1
View File
@@ -1,4 +1,4 @@
#!/usr/bin/env bash
cd target
spark-submit --class ClusterSOData.Main --master local KMeans-0.0.1.jar
spark-submit --packages com.databricks:spark-xml_2.11:0.4.0 --class ClusterSOData.Main --master local KMeans-0.0.1.jar
+12 -5
View File
@@ -8,6 +8,7 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import com.databricks.spark.xml.XmlReader;
/*
* Run KMeans clustering on the StackOverflow dataset
*/
@@ -27,10 +28,16 @@ object Main {
def main(args: Array[String]) {
// Retrieve data from StackOverflow dataset XMLs. Format into DataFrames
// for easy access to data elements.
val dataFrames = DataParser.ParseData()
val a = dataFrames("users")
a.persist()
//val dataFrames = DataParser.ParseData()
// Explicit schema so only the Reputation attribute of each record is read.
// NOTE(review): the "_" prefix relies on spark-xml's default attributePrefix — confirm
// that users' reputation is stored as an XML attribute named "Reputation".
val customSchema = StructType(Array(
  StructField("_Reputation", StringType, nullable = true)))
val localxml = "../stackoverflow_dataset/users.txt"
// Name of the XML element that is treated as one row of the DataFrame.
val booksFileTag = "row"
// Options and schema must be set on the DataFrameReader *before* load():
// once load() has returned a DataFrame, `schema` is only an accessor and
// cannot be used to apply a custom schema.
val df = sqlContext.read
  .format("com.databricks.spark.xml")
  .option("rowTag", booksFileTag)
  .schema(customSchema)
  .load(localxml)
df.printSchema()
// get the users XML file
//val users = dataFrames("users")
//users.persist()
@@ -44,7 +51,7 @@ object Main {
// create new dataframe with only the reputation of the users
//val a = users.select("Reputation").rdd.map(r => r(0)).persist()
a.take(2).foreach(f => println(f(1)))