Pre-master merge commit
This commit is contained in:
+1
-1
@@ -72,7 +72,7 @@ stuff the first time you run it...)
|
|||||||
To run the compiled application:
|
To run the compiled application:
|
||||||
|
|
||||||
cd target
|
cd target
|
||||||
spark-submit --class KMeans --master local KMeans-0.0.1.jar
|
spark-submit --class ClusterSOData.Main --master local KMeans-0.0.1.jar
|
||||||
|
|
||||||
That should run without errors, producing an output folder. Check that
|
That should run without errors, producing an output folder. Check that
|
||||||
something has been generated by running:
|
something has been generated by running:
|
||||||
|
|||||||
@@ -3,16 +3,17 @@ package ClusterSOData
|
|||||||
import org.apache.spark.SparkContext
|
import org.apache.spark.SparkContext
|
||||||
import org.apache.spark.SparkContext._
|
import org.apache.spark.SparkContext._
|
||||||
import org.apache.spark._
|
import org.apache.spark._
|
||||||
|
import org.apache.spark.sql._
|
||||||
|
|
||||||
object KMeans {
|
object KMeans {
|
||||||
/**
|
/**
|
||||||
* Run KMeans clustering on an input RDD vector
|
* Run KMeans clustering on an input RDD vector
|
||||||
*/
|
*/
|
||||||
def run(
|
def train(
|
||||||
//data: RDD[Vector]
|
//data: DataSet
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
// val counts = inputfile.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey(_+_);
|
//Normalise data using Euclidean normalisation
|
||||||
// counts.saveAsTextFile("output")
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -24,6 +24,8 @@ object Main {
|
|||||||
// for easy access to data elements.
|
// for easy access to data elements.
|
||||||
val df = DataParser.ParseData()
|
val df = DataParser.ParseData()
|
||||||
|
|
||||||
KMeans.run()
|
// Get the DataFrame parsed from the users XML file
|
||||||
|
val users = df("users")
|
||||||
|
users.show()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,27 +18,22 @@ object DataParser {
|
|||||||
/*
|
/*
|
||||||
* Generate array of DataFrames from XML content
|
* Generate a map from dataset name to DataFrame from XML content
|
||||||
*/
|
*/
|
||||||
def ParseData() : Array[DataFrame] = {
|
def ParseData() : Map[String, DataFrame] = {
|
||||||
|
|
||||||
// Define XML file locations and a string of attribute tags to retrieve
|
// Define dataset names, XML file locations and a string of attribute tags to retrieve
|
||||||
// from each xml element.
|
// from each xml element.
|
||||||
val xmlInfos = Array(
|
val xmlInfos = Array(
|
||||||
("../stackoverflow_dataset/badges.txt", "Id UserId Name Date"),
|
("badges", "../stackoverflow_dataset/badges.txt", "Id UserId Name Date"),
|
||||||
("../stackoverflow_dataset/comments.txt", "Id PostId Score Text CreationDate UserId"),
|
("comments", "../stackoverflow_dataset/comments.txt", "Id PostId Score Text CreationDate UserId"),
|
||||||
("../stackoverflow_dataset/posts.txt", "Id PostTypeId ParentID AcceptedAnswerId CreationDate Score ViewCount Body OwnerUserId LastEditorUserId LastEditorDisplayName LastEditDate LastActivityDate CommunityOwnedDate ClosedDate Title Tags AnswerCount CommentCount FavoriteCount"),
|
("posts", "../stackoverflow_dataset/posts.txt", "Id PostTypeId ParentID AcceptedAnswerId CreationDate Score ViewCount Body OwnerUserId LastEditorUserId LastEditorDisplayName LastEditDate LastActivityDate CommunityOwnedDate ClosedDate Title Tags AnswerCount CommentCount FavoriteCount"),
|
||||||
("../stackoverflow_dataset/postHistory.txt","Id PostHistoryTypeId PostId RevisionGUID CreationDate UserId UserDisplayName Comment Text CloseReasonId"),
|
("postHistory", "../stackoverflow_dataset/postHistory.txt","Id PostHistoryTypeId PostId RevisionGUID CreationDate UserId UserDisplayName Comment Text CloseReasonId"),
|
||||||
("../stackoverflow_dataset/postLinks.txt", "Id CreationDate PostId RelatedPostId PostLinkTypeId"),
|
("postLinks", "../stackoverflow_dataset/postLinks.txt", "Id CreationDate PostId RelatedPostId PostLinkTypeId"),
|
||||||
("../stackoverflow_dataset/users.txt", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes"),
|
("users", "../stackoverflow_dataset/users.txt", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes"),
|
||||||
("../stackoverflow_dataset/votes.txt", "Id PostId VoteTypeId UserId CreationDate")
|
("votes", "../stackoverflow_dataset/votes.txt", "Id PostId VoteTypeId UserId CreationDate")
|
||||||
)
|
)
|
||||||
|
|
||||||
// Store each file's DataFrame in an array of DataFrames.
|
// Store each file's DataFrame in a map keyed by its dataset name.
|
||||||
val parsedData = xmlInfos.map(x => ParseXMLInfo(x))
|
val parsedData = xmlInfos.map(x => (x._1, ParseXMLInfo((x._2, x._3)))).toMap
|
||||||
|
|
||||||
// Display a subset of each DataFrame's data in a table
|
|
||||||
for(i <- 0 until parsedData.length){
|
|
||||||
parsedData(i).show()
|
|
||||||
}
|
|
||||||
|
|
||||||
return parsedData
|
return parsedData
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user