5 Commits

Author SHA1 Message Date
Paul Campbell 3d648b26f7 first draft of report 2016-12-17 20:12:21 +00:00
Paul Campbell cbb470dd73 Improved XML Parser 2016-12-16 13:58:09 +00:00
Paul Campbell d6262b84ca Merge branch 'sam-dev' into paul-dev 2016-12-16 13:44:05 +00:00
Paul Campbell 81b6e47e26 FUCK 2016-12-16 13:42:24 +00:00
Paul Campbell abc8437620 Improved XML Parser 2016-12-16 13:36:02 +00:00
2 changed files with 14 additions and 2 deletions
Binary file not shown.
+14 -2
View File
@@ -9,6 +9,18 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import scala.xml.Elem
import scala.xml.factory.XMLLoader
import javax.xml.parsers.SAXParser
/**
 * XML loader that yields [[scala.xml.Elem]] values through a SAX parser
 * configured here instead of the loader's default one.
 *
 * The parser is not namespace-aware and has the Xerces
 * `disallow-doctype-decl` feature switched on.
 */
object MyXML extends XMLLoader[Elem] {

  /**
   * Supplies the SAX parser used for loading.
   *
   * A new factory is created and configured on every access, so each call
   * returns a freshly built parser.
   *
   * @return a non-namespace-aware parser with `disallow-doctype-decl` enabled
   */
  override def parser: SAXParser = {
    import javax.xml.parsers.SAXParserFactory

    val factory = SAXParserFactory.newInstance()
    factory.setNamespaceAware(false)
    // Per the Xerces feature documentation, enabling this makes the parser
    // reject documents that carry a DOCTYPE declaration.
    factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true)
    factory.newSAXParser()
  }
}
/*
* Format and parse XML data into datasets, ready for further processing using
* Spark
@@ -30,7 +42,7 @@ object DataParser {
("postHistory", "/data/stackoverflow/PostHistory","Id PostHistoryTypeId PostId RevisionGUID CreationDate UserId UserDisplayName Comment Text CloseReasonId", Array[DataType](IntegerType, IntegerType, IntegerType,IntegerType, DateType, IntegerType, StringType, StringType, StringType, IntegerType)),
("postLinks", "data/stackoverflow/PostLinks", "Id CreationDate PostId RelatedPostId PostLinkTypeId", Array[DataType](IntegerType, DateType, IntegerType, IntegerType, IntegerType)),
*/
("users", "/data/stackoverflow/Users", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes", Array[DataType](IntegerType, DateType, StringType, StringType, DateType, StringType, StringType, IntegerType, StringType, IntegerType, IntegerType, IntegerType))
("users", "/Users/Work/o/Big_Data_Assignment_2/stackoverflow_dataset/users.txt", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes", Array[DataType](IntegerType, DateType, StringType, StringType, DateType, StringType, StringType, IntegerType, StringType, IntegerType, IntegerType, IntegerType))
/*
("votes", "/data/stackoverflow/Votes", "Id PostId VoteTypeId UserId CreationDate", Array[DataType](IntegerType, IntegerType, IntegerType, IntegerType, DateType))
*/
@@ -102,7 +114,7 @@ object DataParser {
private def ParsingFunc(line: String, schemaString: String, schemaType: Array[DataType]) : Row = {
// Parse line of XML using Scala's built-in XML library
val xmlLine = scala.xml.XML.loadString(line)
val xmlLine = MyXML.loadString(line)
var schemaPairs = schemaString.split(" ") zip schemaType
// Create array of values with element for each attribute in schemaString
var lineData = schemaPairs.map { case (fieldName: String, dType: DataType) => castToDType(getXMLAttribute(xmlLine, fieldName), dType) }