Pre-master merge commit

This commit is contained in:
Sam Perry
2016-12-14 14:41:25 +00:00
parent f491d78c41
commit b41d085184
4 changed files with 18 additions and 20 deletions
+1 -1
View File
@@ -72,7 +72,7 @@ stuff the first time you run it...)
To run the compiled application:
cd target
spark-submit --class KMeans --master local KMeans-0.0.1.jar
spark-submit --class ClusterSOData.Main --master local KMeans-0.0.1.jar
That should run without errors, producing an output folder. Check that
something has been generated by running:
+5 -4
View File
@@ -3,16 +3,17 @@ package ClusterSOData
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark._
import org.apache.spark.sql._
object KMeans {
/**
* Run KMeans clustering on an input RDD vector
*/
def run(
//data: RDD[Vector]
def train(
//data: DataSet
)
{
// val counts = inputfile.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey(_+_);
// counts.saveAsTextFile("output")
//Normalise data using Euclidean normalisation
}
}
+3 -1
View File
@@ -24,6 +24,8 @@ object Main {
// for easy access to data elements.
val df = DataParser.ParseData()
KMeans.run()
// Look up the DataFrame parsed from the users XML file by its "users" key
val users = df("users")
users.show()
}
}
+9 -14
View File
@@ -18,27 +18,22 @@ object DataParser {
/*
* Generate a map from dataset name to DataFrame from XML content
*/
def ParseData() : Array[DataFrame] = {
def ParseData() : Map[String, DataFrame] = {
// Define each dataset's name, its XML file location and a string of
// attribute tags to retrieve from each XML element.
val xmlInfos = Array(
("../stackoverflow_dataset/badges.txt", "Id UserId Name Date"),
("../stackoverflow_dataset/comments.txt", "Id PostId Score Text CreationDate UserId"),
("../stackoverflow_dataset/posts.txt", "Id PostTypeId ParentID AcceptedAnswerId CreationDate Score ViewCount Body OwnerUserId LastEditorUserId LastEditorDisplayName LastEditDate LastActivityDate CommunityOwnedDate ClosedDate Title Tags AnswerCount CommentCount FavoriteCount"),
("../stackoverflow_dataset/postHistory.txt","Id PostHistoryTypeId PostId RevisionGUID CreationDate UserId UserDisplayName Comment Text CloseReasonId"),
("../stackoverflow_dataset/postLinks.txt", "Id CreationDate PostId RelatedPostId PostLinkTypeId"),
("../stackoverflow_dataset/users.txt", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes"),
("../stackoverflow_dataset/votes.txt", "Id PostId VoteTypeId UserId CreationDate")
("badges", "../stackoverflow_dataset/badges.txt", "Id UserId Name Date"),
("comments", "../stackoverflow_dataset/comments.txt", "Id PostId Score Text CreationDate UserId"),
("posts", "../stackoverflow_dataset/posts.txt", "Id PostTypeId ParentID AcceptedAnswerId CreationDate Score ViewCount Body OwnerUserId LastEditorUserId LastEditorDisplayName LastEditDate LastActivityDate CommunityOwnedDate ClosedDate Title Tags AnswerCount CommentCount FavoriteCount"),
("postHistory", "../stackoverflow_dataset/postHistory.txt","Id PostHistoryTypeId PostId RevisionGUID CreationDate UserId UserDisplayName Comment Text CloseReasonId"),
("postLinks", "../stackoverflow_dataset/postLinks.txt", "Id CreationDate PostId RelatedPostId PostLinkTypeId"),
("users", "../stackoverflow_dataset/users.txt", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes"),
("votes", "../stackoverflow_dataset/votes.txt", "Id PostId VoteTypeId UserId CreationDate")
)
// Store each file's DataFrame in a map keyed by its dataset name.
val parsedData = xmlInfos.map(x => ParseXMLInfo(x))
// Display a subset of each DataFrame's data in a table
for(i <- 0 until parsedData.length){
parsedData(i).show()
}
val parsedData = xmlInfos.map(x => (x._1, ParseXMLInfo((x._2, x._3)))).toMap
return parsedData
}