Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 3d648b26f7 | |||
| cbb470dd73 | |||
| d6262b84ca | |||
| 81b6e47e26 | |||
| abc8437620 | |||
| f63e2cd4b2 | |||
| c9c718dbe8 | |||
| f4e555ab9a | |||
| 8ef2828723 | |||
| 32e774819e | |||
| b41d085184 |
Binary file not shown.
+1
-1
@@ -72,7 +72,7 @@ stuff the first time you run it...)
|
||||
To run the compiled application:
|
||||
|
||||
cd target
|
||||
spark-submit --class KMeans --master local KMeans-0.0.1.jar
|
||||
spark-submit --class ClusterSOData.Main --master local KMeans-0.0.1.jar
|
||||
|
||||
That should run without errors, producing an output folder. Check that
|
||||
something has been generated by running:
|
||||
|
||||
Executable
+4
@@ -0,0 +1,4 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
cd target
|
||||
spark-submit --class ClusterSOData.Main --master local KMeans-0.0.1.jar
|
||||
@@ -3,16 +3,17 @@ package ClusterSOData
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.SparkContext._
|
||||
import org.apache.spark._
|
||||
import org.apache.spark.sql._
|
||||
|
||||
object KMeans {
|
||||
/**
|
||||
* Run KMeans clustering on an input RDD vector
|
||||
*/
|
||||
def run(
|
||||
//data: RDD[Vector]
|
||||
def train(
|
||||
//data: DataSet
|
||||
)
|
||||
{
|
||||
// val counts = inputfile.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey(_+_);
|
||||
// counts.saveAsTextFile("output")
|
||||
//Normalise data using Euclidean normalisation
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,8 +22,21 @@ object Main {
|
||||
def main(args: Array[String]) {
|
||||
// Retrieve data from StackOverflow dataset XMLs. Format into DataFrames
|
||||
// for easy access to data elements.
|
||||
val df = DataParser.ParseData()
|
||||
val dataFrames = DataParser.ParseData()
|
||||
|
||||
KMeans.run()
|
||||
// get the users XML file
|
||||
val users = dataFrames("users")
|
||||
users.persist()
|
||||
// Show 20 entries from the user dataset
|
||||
users.show()
|
||||
// Show types for the user dataset
|
||||
users.printSchema()
|
||||
users.show()
|
||||
|
||||
// create new dataframe with only the reputation of the users
|
||||
users.select("CreationDate").show()
|
||||
|
||||
// Info on using DataFrames here: https://www.mapr.com/blog/using-apache-spark-dataframes-processing-tabular-data
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -9,6 +9,18 @@ import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql._
|
||||
import org.apache.spark.sql.types._
|
||||
|
||||
import scala.xml.Elem
|
||||
import scala.xml.factory.XMLLoader
|
||||
import javax.xml.parsers.SAXParser
|
||||
object MyXML extends XMLLoader[Elem] {
|
||||
override def parser: SAXParser = {
|
||||
val f = javax.xml.parsers.SAXParserFactory.newInstance()
|
||||
f.setNamespaceAware(false)
|
||||
f.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
|
||||
f.newSAXParser()
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Format and parse XML data to datasets, ready for further processing using
|
||||
* spark
|
||||
@@ -18,38 +30,38 @@ object DataParser {
|
||||
/*
|
||||
* Generate array of DataFrames from XML content
|
||||
*/
|
||||
def ParseData() : Array[DataFrame] = {
|
||||
def ParseData() : Map[String, DataFrame] = {
|
||||
|
||||
// Define XML file locations and a string of attribute tags to retrieve
|
||||
// from each xml element.
|
||||
val xmlInfos = Array(
|
||||
("../stackoverflow_dataset/badges.txt", "Id UserId Name Date"),
|
||||
("../stackoverflow_dataset/comments.txt", "Id PostId Score Text CreationDate UserId"),
|
||||
("../stackoverflow_dataset/posts.txt", "Id PostTypeId ParentID AcceptedAnswerId CreationDate Score ViewCount Body OwnerUserId LastEditorUserId LastEditorDisplayName LastEditDate LastActivityDate CommunityOwnedDate ClosedDate Title Tags AnswerCount CommentCount FavoriteCount"),
|
||||
("../stackoverflow_dataset/postHistory.txt","Id PostHistoryTypeId PostId RevisionGUID CreationDate UserId UserDisplayName Comment Text CloseReasonId"),
|
||||
("../stackoverflow_dataset/postLinks.txt", "Id CreationDate PostId RelatedPostId PostLinkTypeId"),
|
||||
("../stackoverflow_dataset/users.txt", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes"),
|
||||
("../stackoverflow_dataset/votes.txt", "Id PostId VoteTypeId UserId CreationDate")
|
||||
)
|
||||
/*
|
||||
("badges", "/data/stackoverflow/Badges", "Id UserId Name Date", Array[DataType](IntegerType, IntegerType, StringType, DateType)),
|
||||
("comments", "/data/stackoverflow/Comments", "Id PostId Score Text CreationDate UserId", Array[DataType](IntegerType, IntegerType, IntegerType, StringType, DateType, IntegerType)),
|
||||
("posts", "data/stackoverflow/Posts", "Id PostTypeId ParentID AcceptedAnswerId CreationDate Score ViewCount Body OwnerUserId LastEditorUserId LastEditorDisplayName LastEditDate LastActivityDate CommunityOwnedDate ClosedDate Title Tags AnswerCount CommentCount FavoriteCount", Array[DataType](IntegerType, IntegerType, IntegerType, IntegerType, DateType, IntegerType, IntegerType, StringType, IntegerType, IntegerType, StringType, DateType, DateType, DateType, DateType, StringType, StringType, IntegerType, IntegerType, IntegerType)),
|
||||
("postHistory", "/data/stackoverflow/PostHistory","Id PostHistoryTypeId PostId RevisionGUID CreationDate UserId UserDisplayName Comment Text CloseReasonId", Array[DataType](IntegerType, IntegerType, IntegerType,IntegerType, DateType, IntegerType, StringType, StringType, StringType, IntegerType)),
|
||||
("postLinks", "data/stackoverflow/PostLinks", "Id CreationDate PostId RelatedPostId PostLinkTypeId", Array[DataType](IntegerType, DateType, IntegerType, IntegerType, IntegerType)),
|
||||
*/
|
||||
("users", "/Users/Work/o/Big_Data_Assignment_2/stackoverflow_dataset/users.txt", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes", Array[DataType](IntegerType, DateType, StringType, StringType, DateType, StringType, StringType, IntegerType, StringType, IntegerType, IntegerType, IntegerType))
|
||||
/*
|
||||
("votes", "/data/stackoverflow/Votes", "Id PostId VoteTypeId UserId CreationDate", Array[DataType](IntegerType, IntegerType, IntegerType, IntegerType, DateType))
|
||||
*/
|
||||
)
|
||||
|
||||
// Store each file's DataFrame in an array of DataFrames.
|
||||
val parsedData = xmlInfos.map(x => ParseXMLInfo(x))
|
||||
|
||||
// Display a subset of each DataFrame's data in a table
|
||||
for(i <- 0 until parsedData.length){
|
||||
parsedData(i).show()
|
||||
}
|
||||
val parsedData = xmlInfos.map(x => (x._1, ParseXMLInfo((x._2, x._3, x._4)))).toMap
|
||||
|
||||
return parsedData
|
||||
}
|
||||
|
||||
private def ParseXMLInfo(xmlInfo: (String, String)) : DataFrame = {
|
||||
private def ParseXMLInfo(xmlInfo: (String, String, Array[DataType])) : DataFrame = {
|
||||
// Get the XML attributes used for generating the table columns
|
||||
var schemaString = xmlInfo._2
|
||||
val schemaType = xmlInfo._3
|
||||
// Generate schema using XML attribute string
|
||||
var schema = GenerateSchemaFromString(schemaString)
|
||||
var schema = GenerateSchemaFromString(schemaString, schemaType)
|
||||
// Generate RDD of data from the XML file
|
||||
var rdd = ParseInput(xmlInfo._1, schemaString)
|
||||
var rdd = ParseInput(xmlInfo._1, schemaString, schemaType)
|
||||
// Convert RDD to DataFrame for easier processing
|
||||
var data = Main.sqlContext.createDataFrame(rdd, schema)
|
||||
|
||||
@@ -60,9 +72,13 @@ object DataParser {
|
||||
/*
|
||||
* Generate a schema based on the string of XML attributes
|
||||
*/
|
||||
private def GenerateSchemaFromString(schemaString: String) : StructType = {
|
||||
val fields = schemaString.split(" ")
|
||||
.map(fieldName => StructField(fieldName, StringType, nullable = true))
|
||||
private def GenerateSchemaFromString(schemaString: String, schemaType: Array[DataType]) : StructType = {
|
||||
// Replace all DateTypes with Longs as date will now be stored as longs.
|
||||
val sT = schemaType.map(i => if (i==DateType) LongType else i)
|
||||
val schemaPairs = schemaString.split(" ") zip sT
|
||||
|
||||
// Create schema for columns and set their datatypes for DataFrame based on attribute names.
|
||||
val fields = schemaPairs.map{case (fieldName: String, dataType: DataType) => StructField(fieldName, dataType, nullable = true)}
|
||||
val schema = StructType(fields)
|
||||
return schema
|
||||
}
|
||||
@@ -73,12 +89,14 @@ object DataParser {
|
||||
* inputFilepath: Filepath to XML file
|
||||
* schemaString: Space seperated attribute values
|
||||
*/
|
||||
private def ParseInput(inputFilepath: String, schemaString: String) : RDD[Row] = {
|
||||
private def ParseInput(inputFilepath: String, schemaString: String, schemaType: Array[DataType]) : RDD[Row] = {
|
||||
// Create spark text file object
|
||||
val inputFile = Main.sc.textFile(inputFilepath)
|
||||
|
||||
// Map the input file data to an RDD
|
||||
val Data = inputFile.map(line => ParsingFunc(line, schemaString))
|
||||
val Data = inputFile.collect {
|
||||
case line if !SantizeLine(line) => ParsingFunc(line, schemaString, schemaType)
|
||||
}
|
||||
return Data
|
||||
}
|
||||
|
||||
@@ -88,15 +106,43 @@ object DataParser {
|
||||
* line: XML file line
|
||||
* schemaString: Space seperated attribute values
|
||||
*/
|
||||
private def ParsingFunc(line: String, schemaString: String) : Row = {
|
||||
|
||||
private def SantizeLine(line: String) : Boolean = {
|
||||
val invalidLines = Array("<?xml version=\"1.0\" encoding=\"utf-8\"?>", "<users>", "</users>")
|
||||
return invalidLines contains line
|
||||
}
|
||||
|
||||
private def ParsingFunc(line: String, schemaString: String, schemaType: Array[DataType]) : Row = {
|
||||
// Parse line of XML using Scala's built in XML library
|
||||
val xmlLine = scala.xml.XML.loadString(line)
|
||||
val xmlLine = MyXML.loadString(line)
|
||||
var schemaPairs = schemaString.split(" ") zip schemaType
|
||||
// Create array of values with element for each attribute in schemaString
|
||||
var lineData = schemaString.split(" ").map(fieldName => getXMLAttribute(xmlLine, fieldName))
|
||||
var lineData = schemaPairs.map { case (fieldName: String, dType: DataType) => castToDType(getXMLAttribute(xmlLine, fieldName), dType) }
|
||||
|
||||
return Row.fromSeq(lineData)
|
||||
}
|
||||
|
||||
/*
|
||||
* Cast attribute data to relevant datatype.
|
||||
*/
|
||||
private def castToDType(attribute: String, dType: DataType) : Any = {
|
||||
dType match {
|
||||
case StringType => return attribute
|
||||
case IntegerType =>
|
||||
try {
|
||||
return attribute.toInt
|
||||
} catch {
|
||||
// If the string was not castable to integer then it is not a number.
|
||||
// In this case, return a placeholder value of -1.
|
||||
case e: Exception => return -1
|
||||
}
|
||||
case DateType =>
|
||||
// If the string is a date, convert from date string to long.
|
||||
var format = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS")
|
||||
return format.parse(attribute).getTime()
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle NullPointerError raised when an attribute doesn't exist
|
||||
*
|
||||
|
||||
Reference in New Issue
Block a user