Merge branch 'sam-dev' into joe-dev2

This commit is contained in:
Joe Darby
2016-12-15 21:39:44 +00:00
3 changed files with 80 additions and 19 deletions
Executable
+4
View File
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
# Submit the ClusterSOData.Main class from KMeans-0.0.1.jar to a local Spark master.
# The jar path is relative, so the job must be launched from inside target/.
# Stop immediately if target/ cannot be entered instead of running
# spark-submit from the wrong working directory.
cd target || exit 1
spark-submit --class ClusterSOData.Main --master local KMeans-0.0.1.jar
+18
View File
@@ -22,6 +22,7 @@ object Main {
def main(args: Array[String]) {
// Retrieve data from StackOverflow dataset XMLs. Format into DataFrames
// for easy access to data elements.
val df = XMLParser.ParseData()
// get the users XML file
@@ -30,5 +31,22 @@ object Main {
val centresArray = centres.collect()
val unwrap = centresArray.map(x => x._2)
unwrap.foreach(println)
/*val dataFrames = DataParser.ParseData()
// get the users XML file
val users = dataFrames("users")
users.persist()
// Show 20 entries from the user dataset
users.show()
// Show types for the user dataset
users.printSchema()
users.show()
// create new dataframe with only the creation date of the users
users.select("CreationDate").show()
*/
// Info on using DataFrames here: https://www.mapr.com/blog/using-apache-spark-dataframes-processing-tabular-data
}
}
+58 -19
View File
@@ -23,28 +23,33 @@ object XMLParser {
// Define XML file locations and a string of attribute tags to retrieve
// from each xml element.
val xmlInfos = Array(
("badges", "../stackoverflow_dataset/badges.txt", "Id UserId Name Date"),
("comments", "../stackoverflow_dataset/comments.txt", "Id PostId Score Text CreationDate UserId"),
("posts", "../stackoverflow_dataset/posts.txt", "Id PostTypeId ParentID AcceptedAnswerId CreationDate Score ViewCount Body OwnerUserId LastEditorUserId LastEditorDisplayName LastEditDate LastActivityDate CommunityOwnedDate ClosedDate Title Tags AnswerCount CommentCount FavoriteCount"),
("postHistory", "../stackoverflow_dataset/postHistory.txt","Id PostHistoryTypeId PostId RevisionGUID CreationDate UserId UserDisplayName Comment Text CloseReasonId"),
("postLinks", "../stackoverflow_dataset/postLinks.txt", "Id CreationDate PostId RelatedPostId PostLinkTypeId"),
("users", "../stackoverflow_dataset/users.txt", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes"),
("votes", "../stackoverflow_dataset/votes.txt", "Id PostId VoteTypeId UserId CreationDate")
)
/*
("badges", "/data/stackoverflow/Badges", "Id UserId Name Date", Array[DataType](IntegerType, IntegerType, StringType, DateType)),
("comments", "/data/stackoverflow/Comments", "Id PostId Score Text CreationDate UserId", Array[DataType](IntegerType, IntegerType, IntegerType, StringType, DateType, IntegerType)),
("posts", "data/stackoverflow/Posts", "Id PostTypeId ParentID AcceptedAnswerId CreationDate Score ViewCount Body OwnerUserId LastEditorUserId LastEditorDisplayName LastEditDate LastActivityDate CommunityOwnedDate ClosedDate Title Tags AnswerCount CommentCount FavoriteCount", Array[DataType](IntegerType, IntegerType, IntegerType, IntegerType, DateType, IntegerType, IntegerType, StringType, IntegerType, IntegerType, StringType, DateType, DateType, DateType, DateType, StringType, StringType, IntegerType, IntegerType, IntegerType)),
("postHistory", "/data/stackoverflow/PostHistory","Id PostHistoryTypeId PostId RevisionGUID CreationDate UserId UserDisplayName Comment Text CloseReasonId", Array[DataType](IntegerType, IntegerType, IntegerType,IntegerType, DateType, IntegerType, StringType, StringType, StringType, IntegerType)),
("postLinks", "data/stackoverflow/PostLinks", "Id CreationDate PostId RelatedPostId PostLinkTypeId", Array[DataType](IntegerType, DateType, IntegerType, IntegerType, IntegerType)),
*/
("users", "/data/stackoverflow/Users", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes", Array[DataType](IntegerType, DateType, StringType, StringType, DateType, StringType, StringType, IntegerType, StringType, IntegerType, IntegerType, IntegerType))
/*
("votes", "/data/stackoverflow/Votes", "Id PostId VoteTypeId UserId CreationDate", Array[DataType](IntegerType, IntegerType, IntegerType, IntegerType, DateType))
*/
)
// Store each file's DataFrame in an array of DataFrames.
val parsedData = xmlInfos.map(x => (x._1, ParseXMLInfo((x._2, x._3)))).toMap
val parsedData = xmlInfos.map(x => (x._1, ParseXMLInfo((x._2, x._3, x._4)))).toMap
return parsedData
}
private def ParseXMLInfo(xmlInfo: (String, String)) : DataFrame = {
private def ParseXMLInfo(xmlInfo: (String, String, Array[DataType])) : DataFrame = {
// Get the XML attributes used for generating the table columns
var schemaString = xmlInfo._2
val schemaType = xmlInfo._3
// Generate schema using XML attribute string
var schema = GenerateSchemaFromString(schemaString)
var schema = GenerateSchemaFromString(schemaString, schemaType)
// Generate RDD of data from the XML file
var rdd = ParseInput(xmlInfo._1, schemaString)
var rdd = ParseInput(xmlInfo._1, schemaString, schemaType)
// Convert RDD to DataFrame for easier processing
var data = Main.sqlContext.createDataFrame(rdd, schema)
@@ -55,9 +60,13 @@ object XMLParser {
/*
* Generate a schema based on the string of XML attributes
*
* schemaString: Space separated attribute names, one per column
* schemaType: Data types paired positionally with the names in schemaString
*
* Returns a StructType holding one nullable StructField per (name, type) pair.
*
* NOTE(review): two signatures appear back to back below; the one-argument
* form (every column typed as StringType) looks like the pre-merge version
* of this method - confirm which one is live.
*/
private def GenerateSchemaFromString(schemaString: String) : StructType = {
val fields = schemaString.split(" ")
.map(fieldName => StructField(fieldName, StringType, nullable = true))
private def GenerateSchemaFromString(schemaString: String, schemaType: Array[DataType]) : StructType = {
// Replace all DateTypes with Longs as date will now be stored as longs.
val sT = schemaType.map(i => if (i==DateType) LongType else i)
// Pair names with types by position; zip stops at the shorter of the two
// arrays, so surplus names or types are silently dropped.
val schemaPairs = schemaString.split(" ") zip sT
// Create schema for columns and set their datatypes for DataFrame based on attribute names.
val fields = schemaPairs.map{case (fieldName: String, dataType: DataType) => StructField(fieldName, dataType, nullable = true)}
val schema = StructType(fields)
return schema
}
@@ -68,12 +77,14 @@ object XMLParser {
* inputFilepath: Filepath to XML file
* schemaString: Space separated attribute values
*/
// NOTE(review): two signatures and two definitions of Data appear below; the
// two-argument signature and the plain inputFile.map look like the pre-merge
// versions - confirm which lines are live.
private def ParseInput(inputFilepath: String, schemaString: String) : RDD[Row] = {
private def ParseInput(inputFilepath: String, schemaString: String, schemaType: Array[DataType]) : RDD[Row] = {
// Create spark text file object
val inputFile = Main.sc.textFile(inputFilepath)
// Map the input file data to an RDD
val Data = inputFile.map(line => ParsingFunc(line, schemaString))
// collect with a partial function both filters and maps: lines for which
// SantizeLine returns true are dropped, every other line is handed to
// ParsingFunc together with the attribute names and their data types.
val Data = inputFile.collect {
case line if !SantizeLine(line) => ParsingFunc(line, schemaString, schemaType)
}
return Data
}
@@ -83,15 +94,43 @@ object XMLParser {
* line: XML file line
* schemaString: Space separated attribute values
*/
private def ParsingFunc(line: String, schemaString: String) : Row = {
/*
 * Report whether a line is XML scaffolding rather than a data row.
 *
 * line: XML file line
 *
 * Returns true only when the line is exactly the XML declaration or the
 * opening/closing <users> tag; any other text (including the same tags
 * with surrounding whitespace) yields false.
 */
private def SantizeLine(line: String) : Boolean = {
  val xmlDeclaration = "<?xml version=\"1.0\" encoding=\"utf-8\"?>"
  val openingTag = "<users>"
  val closingTag = "</users>"
  // Exact, case-sensitive comparison against each known non-data line.
  line == xmlDeclaration || line == openingTag || line == closingTag
}
// Turn one XML line into a Row holding one value per attribute named in
// schemaString, in schemaString order.
private def ParsingFunc(line: String, schemaString: String, schemaType: Array[DataType]) : Row = {
// Parse line of XML using Scala's built in XML library
val xmlLine = scala.xml.XML.loadString(line)
// Pair each attribute name with its declared data type, by position
var schemaPairs = schemaString.split(" ") zip schemaType
// Create array of values with element for each attribute in schemaString
// NOTE(review): lineData is defined twice below; the first definition (raw,
// uncast attribute values) looks like the pre-merge version - confirm which
// one is live.
var lineData = schemaString.split(" ").map(fieldName => getXMLAttribute(xmlLine, fieldName))
// Look up each attribute on the parsed element and cast it to its paired type
var lineData = schemaPairs.map { case (fieldName: String, dType: DataType) => castToDType(getXMLAttribute(xmlLine, fieldName), dType) }
return Row.fromSeq(lineData)
}
/*
 * Cast attribute data to relevant datatype.
 *
 * attribute: Raw attribute text taken from the XML element
 * dType: Declared data type of the column the attribute belongs to
 *
 * Returns the attribute unchanged for StringType, an Int for IntegerType and
 * the date as a Long of epoch milliseconds for DateType. An attribute that
 * cannot be converted yields the placeholder -1 (Int) or -1L (Long), so a
 * single missing or malformed value does not abort the whole parse.
 * Throws IllegalArgumentException for any other data type.
 */
private def castToDType(attribute: String, dType: DataType) : Any = {
  dType match {
    case StringType => attribute
    case IntegerType =>
      // If the string was not castable to integer then it is not a number.
      // In this case, fall back to a placeholder value of -1.
      scala.util.Try(attribute.toInt).getOrElse(-1)
    case DateType =>
      // If the string is a date, convert from date string to long.
      // SimpleDateFormat is not thread-safe, so a fresh instance is built per call.
      // NOTE(review): the string is parsed in the JVM default time zone -
      // confirm that this matches the time zone of the dataset's timestamps.
      val format = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS")
      // Absent or malformed dates get the same kind of placeholder as integers
      // instead of throwing out of the parse.
      scala.util.Try(format.parse(attribute).getTime()).getOrElse(-1L)
    case other =>
      // Fail with a descriptive message rather than a bare MatchError.
      throw new IllegalArgumentException("Unsupported column data type: " + other)
  }
}
/*
* Handle NullPointerError raised when an attribute doesn't exist
*