Pre-master merge commit

This commit is contained in:
Sam Perry
2016-12-14 14:41:25 +00:00
parent f491d78c41
commit b41d085184
4 changed files with 18 additions and 20 deletions
+1 -1
View File
@@ -72,7 +72,7 @@ stuff the first time you run it...)
To run the compiled application: To run the compiled application:
cd target cd target
spark-submit --class KMeans --master local KMeans-0.0.1.jar spark-submit --class ClusterSOData.Main --master local KMeans-0.0.1.jar
That should run without errors, producing an output folder. Check that That should run without errors, producing an output folder. Check that
something has been generated by running: something has been generated by running:
+5 -4
View File
@@ -3,16 +3,17 @@ package ClusterSOData
import org.apache.spark.SparkContext import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._ import org.apache.spark.SparkContext._
import org.apache.spark._ import org.apache.spark._
import org.apache.spark.sql._
object KMeans { object KMeans {
/** /**
* Run KMeans clustering on an input RDD vector * Run KMeans clustering on an input RDD vector
*/ */
def run( def train(
//data: RDD[Vector] //data: DataSet
) )
{ {
// val counts = inputfile.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey(_+_); //Normalise data using Euclidean normalisation
// counts.saveAsTextFile("output")
} }
} }
+3 -1
View File
@@ -24,6 +24,8 @@ object Main {
// for easy access to data elements. // for easy access to data elements.
val df = DataParser.ParseData() val df = DataParser.ParseData()
KMeans.run() // get the users XML file
val users = df("users")
users.show()
} }
} }
+9 -14
View File
@@ -18,27 +18,22 @@ object DataParser {
/* /*
* Generate array of DataFrames from XML content * Generate array of DataFrames from XML content
*/ */
def ParseData() : Array[DataFrame] = { def ParseData() : Map[String, DataFrame] = {
// Define XML file locations and a string of attribute tags to retrieve // Define XML file locations and a string of attribute tags to retrieve
// from each xml element. // from each xml element.
val xmlInfos = Array( val xmlInfos = Array(
("../stackoverflow_dataset/badges.txt", "Id UserId Name Date"), ("badges", "../stackoverflow_dataset/badges.txt", "Id UserId Name Date"),
("../stackoverflow_dataset/comments.txt", "Id PostId Score Text CreationDate UserId"), ("comments", "../stackoverflow_dataset/comments.txt", "Id PostId Score Text CreationDate UserId"),
("../stackoverflow_dataset/posts.txt", "Id PostTypeId ParentID AcceptedAnswerId CreationDate Score ViewCount Body OwnerUserId LastEditorUserId LastEditorDisplayName LastEditDate LastActivityDate CommunityOwnedDate ClosedDate Title Tags AnswerCount CommentCount FavoriteCount"), ("posts", "../stackoverflow_dataset/posts.txt", "Id PostTypeId ParentID AcceptedAnswerId CreationDate Score ViewCount Body OwnerUserId LastEditorUserId LastEditorDisplayName LastEditDate LastActivityDate CommunityOwnedDate ClosedDate Title Tags AnswerCount CommentCount FavoriteCount"),
("../stackoverflow_dataset/postHistory.txt","Id PostHistoryTypeId PostId RevisionGUID CreationDate UserId UserDisplayName Comment Text CloseReasonId"), ("postHistory", "../stackoverflow_dataset/postHistory.txt","Id PostHistoryTypeId PostId RevisionGUID CreationDate UserId UserDisplayName Comment Text CloseReasonId"),
("../stackoverflow_dataset/postLinks.txt", "Id CreationDate PostId RelatedPostId PostLinkTypeId"), ("postLinks", "../stackoverflow_dataset/postLinks.txt", "Id CreationDate PostId RelatedPostId PostLinkTypeId"),
("../stackoverflow_dataset/users.txt", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes"), ("users", "../stackoverflow_dataset/users.txt", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes"),
("../stackoverflow_dataset/votes.txt", "Id PostId VoteTypeId UserId CreationDate") ("votes", "../stackoverflow_dataset/votes.txt", "Id PostId VoteTypeId UserId CreationDate")
) )
// Store each file's DataFrame in an array of DataFrames. // Store each file's DataFrame in an array of DataFrames.
val parsedData = xmlInfos.map(x => ParseXMLInfo(x)) val parsedData = xmlInfos.map(x => (x._1, ParseXMLInfo((x._2, x._3)))).toMap
// Display a subset of each DataFrame's data in a table
for(i <- 0 until parsedData.length){
parsedData(i).show()
}
return parsedData return parsedData
} }