diff --git a/Notes.mkdn b/Notes.mkdn index fc8ae60..54cfe00 100644 --- a/Notes.mkdn +++ b/Notes.mkdn @@ -72,7 +72,7 @@ stuff the first time you run it...) To run the compiled application: cd target - spark-submit --class KMeans --master local KMeans-0.0.1.jar + spark-submit --class ClusterSOData.Main --master local KMeans-0.0.1.jar That should run without errors, producing an output folder. Check that something has been generated by running: diff --git a/src/main/scala/KMeans.scala b/src/main/scala/KMeans.scala index 7da3587..a603e73 100644 --- a/src/main/scala/KMeans.scala +++ b/src/main/scala/KMeans.scala @@ -3,16 +3,17 @@ package ClusterSOData import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark._ +import org.apache.spark.sql._ object KMeans { /** * Run KMeans clustering on an input RDD vector */ - def run( - //data: RDD[Vector] + def train( + //data: DataSet ) { - // val counts = inputfile.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey(_+_); - // counts.saveAsTextFile("output") + //Normalise data using Euclidean normalisation + } } diff --git a/src/main/scala/Main.scala b/src/main/scala/Main.scala index 9149bd0..8915c3e 100644 --- a/src/main/scala/Main.scala +++ b/src/main/scala/Main.scala @@ -24,6 +24,8 @@ object Main { // for easy access to data elements. val df = DataParser.ParseData() - KMeans.run() + // get the users XML file + val users = df("users") + users.show() } } diff --git a/src/main/scala/XMLParser.scala b/src/main/scala/XMLParser.scala index f386ebd..11c23ab 100644 --- a/src/main/scala/XMLParser.scala +++ b/src/main/scala/XMLParser.scala @@ -18,27 +18,22 @@ object DataParser { /* * Generate array of DataFrames from XML content */ - def ParseData() : Array[DataFrame] = { + def ParseData() : Map[String, DataFrame] = { // Define XML file locations and a string of attribute tags to retrieve // from each xml element. val xmlInfos = Array( - ("../stackoverflow_dataset/badges.txt", "Id UserId Name Date"), - ("../stackoverflow_dataset/comments.txt", "Id PostId Score Text CreationDate UserId"), - ("../stackoverflow_dataset/posts.txt", "Id PostTypeId ParentID AcceptedAnswerId CreationDate Score ViewCount Body OwnerUserId LastEditorUserId LastEditorDisplayName LastEditDate LastActivityDate CommunityOwnedDate ClosedDate Title Tags AnswerCount CommentCount FavoriteCount"), - ("../stackoverflow_dataset/postHistory.txt","Id PostHistoryTypeId PostId RevisionGUID CreationDate UserId UserDisplayName Comment Text CloseReasonId"), - ("../stackoverflow_dataset/postLinks.txt", "Id CreationDate PostId RelatedPostId PostLinkTypeId"), - ("../stackoverflow_dataset/users.txt", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes"), - ("../stackoverflow_dataset/votes.txt", "Id PostId VoteTypeId UserId CreationDate") + ("badges", "../stackoverflow_dataset/badges.txt", "Id UserId Name Date"), + ("comments", "../stackoverflow_dataset/comments.txt", "Id PostId Score Text CreationDate UserId"), + ("posts", "../stackoverflow_dataset/posts.txt", "Id PostTypeId ParentID AcceptedAnswerId CreationDate Score ViewCount Body OwnerUserId LastEditorUserId LastEditorDisplayName LastEditDate LastActivityDate CommunityOwnedDate ClosedDate Title Tags AnswerCount CommentCount FavoriteCount"), + ("postHistory", "../stackoverflow_dataset/postHistory.txt","Id PostHistoryTypeId PostId RevisionGUID CreationDate UserId UserDisplayName Comment Text CloseReasonId"), + ("postLinks", "../stackoverflow_dataset/postLinks.txt", "Id CreationDate PostId RelatedPostId PostLinkTypeId"), + ("users", "../stackoverflow_dataset/users.txt", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes"), + ("votes", "../stackoverflow_dataset/votes.txt", "Id PostId VoteTypeId UserId CreationDate") ) // Store each file's DataFrame in an array of DataFrames. - val parsedData = xmlInfos.map(x => ParseXMLInfo(x)) - - // Display a subset of each DataFrame's data in a table - for(i <- 0 until parsedData.length){ - parsedData(i).show() - } + val parsedData = xmlInfos.map(x => (x._1, ParseXMLInfo((x._2, x._3)))).toMap return parsedData }