Pre-master merge commit

2016-12-14 14:41:25 +00:00
parent f491d78c41
commit b41d085184
4 changed files with 18 additions and 20 deletions
@@ -72,7 +72,7 @@ stuff the first time you run it...)
 To run the compiled application:
    cd target
-    spark-submit --class KMeans --master local KMeans-0.0.1.jar
+    spark-submit --class ClusterSOData.Main --master local KMeans-0.0.1.jar
 That should run without errors, producing an output folder. Check that
 something has been generated by running:
@@ -3,16 +3,17 @@ package ClusterSOData
 import org.apache.spark.SparkContext 
 import org.apache.spark.SparkContext._ 
 import org.apache.spark._
 import org.apache.spark.sql._
 object KMeans {
   /**
    * Run KMeans clustering on an input RDD vector
   */
-  def run(
+  def train(
-    //data: RDD[Vector]
+    //data: DataSet
  ) 
  {
-    // val counts = inputfile.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey(_+_);
+    //Normalise data using Euclidean normalisation
-    // counts.saveAsTextFile("output")
+
  }
 }
@@ -24,6 +24,8 @@ object Main {
    // for easy access to data elements.
    val df = DataParser.ParseData()
-    KMeans.run()
+    // get the users DataFrame (parsed from the users XML file)
    val users = df("users")
    users.show()
  }
 }
@@ -18,27 +18,22 @@ object DataParser {
  /*
   * Generate a map from dataset name to DataFrame from XML content
   */
-  def ParseData() : Array[DataFrame] = {
+  def ParseData() : Map[String, DataFrame] = {
    // Define XML file locations and a string of attribute tags to retrieve
    // from each xml element.
    val xmlInfos = Array(
-      ("../stackoverflow_dataset/badges.txt", "Id UserId Name Date"),
+      ("badges", "../stackoverflow_dataset/badges.txt", "Id UserId Name Date"),
-      ("../stackoverflow_dataset/comments.txt", "Id PostId Score Text CreationDate UserId"),
+      ("comments", "../stackoverflow_dataset/comments.txt", "Id PostId Score Text CreationDate UserId"),
-      ("../stackoverflow_dataset/posts.txt", "Id PostTypeId ParentID AcceptedAnswerId CreationDate Score ViewCount Body OwnerUserId LastEditorUserId LastEditorDisplayName LastEditDate LastActivityDate CommunityOwnedDate ClosedDate Title Tags AnswerCount CommentCount FavoriteCount"),
+      ("posts", "../stackoverflow_dataset/posts.txt", "Id PostTypeId ParentID AcceptedAnswerId CreationDate Score ViewCount Body OwnerUserId LastEditorUserId LastEditorDisplayName LastEditDate LastActivityDate CommunityOwnedDate ClosedDate Title Tags AnswerCount CommentCount FavoriteCount"),
-      ("../stackoverflow_dataset/postHistory.txt","Id PostHistoryTypeId PostId RevisionGUID CreationDate UserId UserDisplayName Comment Text CloseReasonId"),
+      ("postHistory", "../stackoverflow_dataset/postHistory.txt","Id PostHistoryTypeId PostId RevisionGUID CreationDate UserId UserDisplayName Comment Text CloseReasonId"),
-      ("../stackoverflow_dataset/postLinks.txt", "Id CreationDate PostId RelatedPostId PostLinkTypeId"),
+      ("postLinks", "../stackoverflow_dataset/postLinks.txt", "Id CreationDate PostId RelatedPostId PostLinkTypeId"),
-      ("../stackoverflow_dataset/users.txt", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes"),
+      ("users", "../stackoverflow_dataset/users.txt", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes"),
-      ("../stackoverflow_dataset/votes.txt", "Id PostId VoteTypeId UserId CreationDate")
+      ("votes", "../stackoverflow_dataset/votes.txt", "Id PostId VoteTypeId UserId CreationDate")
    )
    // Store each file's DataFrame in a map keyed by its dataset name.
-    val parsedData = xmlInfos.map(x => ParseXMLInfo(x))
+    val parsedData = xmlInfos.map(x => (x._1, ParseXMLInfo((x._2, x._3)))).toMap
    // Display a subset of each DataFrame's data in a table
    for(i <- 0 until parsedData.length){
      parsedData(i).show()
    }
    return parsedData
  }