Merge branch 'sam-dev'

2016-12-14 14:42:46 +00:00
parent 283991656b b41d085184
commit a19d05cebb
7 changed files with 337 additions and 0 deletions
@@ -35,3 +35,5 @@ bin/

 external/
 CMakeFiles/
+
+mlib.scala
@@ -0,0 +1 @@
+*.scala
@@ -0,0 +1,80 @@
+Building and Running the Project
+================================
+
+These instructions will work using the files at this commit:
+    
+    git checkout 800b1f59edaa20a9b65f32a815605307e1102baa
+
+First, you need to download the small sample of the Stack Overflow data that
+can be found here:
+
+https://drive.google.com/open?id=0B0uip08Km2LPVTFTRFhrdHF2WW8
+
+Put it in a directory at the project's root called ./stackoverflow_dataset
+
+Next, the following programs need to be installed on your system (homebrew was
+used for easy installation on OSX)
+
+Spark:
+
+    brew install apache-spark
+
+Scala:
+
+    brew install scala
+
+Maven:
+
+    brew install maven
+
+To build and run the project locally, you need to set the versions in the pom.xml
+file to match those of the programs installed on your system.
+The following lines (marked with <<<<) need to be updated in the pom.xml file:
+
+    <dependencies>
+      <dependency>
+        <groupId>org.scala-lang</groupId>
+        <artifactId>scala-library</artifactId>
+        <version>2.11.8</version>                     <<<<
+      </dependency>
+      <dependency>
+        <groupId>org.apache.spark</groupId>
+        <artifactId>spark-core_2.11</artifactId>      <<<<
+        <version>2.0.2</version>                      <<<<
+      </dependency>
+    </dependencies>
+
+Running:
+
+    spark-shell
+
+should print a banner similar to this, showing your Spark and Scala versions:
+
+    Welcome to
+          ____              __
+         / __/__  ___ _____/ /__
+        _\ \/ _ \/ _ `/ __/  '_/
+       /___/ .__/\_,_/_/ /_/\_\   version 2.0.2
+          /_/
+
+    Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_92)
+    Type in expressions to have them evaluated.
+    Type :help for more information.
+
+Having edited this pom.xml file, run the following from the root of the
+project to compile:
+
+    mvn clean package
+
+This should run successfully (and will probably download and install a whole bunch of
+stuff the first time you run it...)
+
+To run the compiled application:
+
+    cd target
+    spark-submit --class ClusterSOData.Main --master local KMeans-0.0.1.jar
+
+That should run without errors, producing an output folder. Check that
+something has been generated by running:
+
+    cat output/part-00000
@@ -0,0 +1,97 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Copyright (c) 2014, Cloudera, Inc. All Rights Reserved.
+
+  Cloudera, Inc. licenses this file to you under the Apache License,
+  Version 2.0 (the "License"). You may not use this file except in
+  compliance with the License. You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+  CONDITIONS OF ANY KIND, either express or implied. See the License for
+  the specific language governing permissions and limitations under the
+  License.
+  -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>uk.ac.qmul.spark</groupId>
+  <artifactId>KMeans</artifactId>
+  <version>0.0.1</version>
+  <packaging>jar</packaging>
+  <name>"Spark KMeans Clustering"</name>
+  
+  <repositories>
+    <repository>
+      <id>scala-tools.org</id>
+      <name>Scala-tools Maven2 Repository</name>
+      <url>http://scala-tools.org/repo-releases</url>
+    </repository>
+    <repository>
+      <id>maven-hadoop</id>
+      <name>Hadoop Releases</name>
+      <url>https://repository.cloudera.com/content/repositories/releases/</url>
+    </repository>
+    <repository>
+      <id>cloudera-repos</id>
+      <name>Cloudera Repos</name>
+      <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
+    </repository>
+  </repositories>
+
+  <pluginRepositories>
+    <pluginRepository>
+      <id>scala-tools.org</id>
+      <name>Scala-tools Maven2 Repository</name>
+      <url>http://scala-tools.org/repo-releases</url>
+    </pluginRepository>
+  </pluginRepositories>
+
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
+  </properties>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.scala-tools</groupId>
+        <artifactId>maven-scala-plugin</artifactId>
+        <version>2.15.2</version>
+        <executions>
+          <execution>
+            <goals>
+              <goal>compile</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <version>3.1</version>
+        <configuration>
+          <source>1.6</source>
+          <target>1.6</target>
+        </configuration>
+      </plugin>
+    </plugins>  
+  </build>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>scala-library</artifactId>
+      <version>2.10.5</version>
+    </dependency>
+    <dependency>
+        <groupId>org.apache.spark</groupId>
+        <artifactId>spark-core_2.10</artifactId>
+        <version>1.6.0</version>
+    </dependency>
+    <dependency>
+        <groupId>org.apache.spark</groupId>
+        <artifactId>spark-sql_2.10</artifactId>
+        <version>1.6.0</version>
+    </dependency>
+  </dependencies>
+</project>
@@ -0,0 +1,19 @@
+package ClusterSOData
+
+import org.apache.spark.SparkContext 
+import org.apache.spark.SparkContext._ 
+import org.apache.spark._
+import org.apache.spark.sql._
+
+object KMeans {
+  /**
+   * Run KMeans clustering on an input RDD vector.
+   *
+   * Not implemented yet: the method currently takes no arguments and does
+   * nothing.
+   */
+  def train(): Unit = {
+    // TODO: accept the input data as a parameter (originally sketched as
+    // `data: DataSet`).
+    // TODO: Normalise data using Euclidean normalisation
+  }
+}
@@ -0,0 +1,31 @@
+package ClusterSOData
+import org.apache.spark.SparkContext 
+import org.apache.spark.SparkContext._ 
+import org.apache.spark._
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SQLContext._
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql._
+import org.apache.spark.sql.types._
+
+/* 
+ * Run KMeans clustering on the StackOverflow dataset
+ */
+object Main {
+  // Initialize spark and SQL to allow for processing of structured data in a
+  // spark cluster. Both values are created when `Main` is first initialised
+  // and are read by DataParser (`Main.sc`, `Main.sqlContext`).
+  val sc: SparkContext = new SparkContext(new SparkConf().setAppName("Spark KMeans Clustering"))
+  val sqlContext: SQLContext = new SQLContext(sc)
+  import sqlContext.implicits._
+
+  /**
+   * Main function for task execution: parses the dataset into DataFrames and
+   * calls `show()` on the "users" table.
+   *
+   * @param args command-line arguments (currently unused)
+   */
+  def main(args: Array[String]): Unit = {
+    try {
+      // Retrieve data from StackOverflow dataset XMLs. Format into DataFrames
+      // for easy access to data elements.
+      val df = DataParser.ParseData()
+
+      // get the users XML file
+      val users = df("users")
+      users.show()
+    } finally {
+      // Always release the Spark resources, even if parsing fails.
+      sc.stop()
+    }
+  }
+}
@@ -0,0 +1,107 @@
+package ClusterSOData
+
+import org.apache.spark.SparkContext 
+import org.apache.spark.SparkContext._ 
+import org.apache.spark._
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.SQLContext._
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql._
+import org.apache.spark.sql.types._
+
+/* 
+ * Format and parse XML data to datasets, ready for further processing using
+ * spark
+ */
+object DataParser {
+
+  /**
+   * Generate one DataFrame per dataset file from its XML content.
+   *
+   * Each entry of `xmlInfos` is a triple of (table name, file path,
+   * space-separated XML attribute names to read from every line).
+   *
+   * @return a map from table name (e.g. "users") to the DataFrame built from
+   *         the corresponding file; every column is a nullable string
+   */
+  def ParseData() : Map[String, DataFrame] = {
+
+    // Define XML file locations and a string of attribute tags to retrieve
+    // from each xml element.
+    // NOTE(review): the posts schema asks for "ParentID"; the Stack Overflow
+    // dump appears to spell this attribute "ParentId" and attribute lookup is
+    // case-sensitive, so this column may always be empty. Confirm against the
+    // data before renaming, since the name is also the DataFrame column name.
+    val xmlInfos = Array(
+      ("badges", "../stackoverflow_dataset/badges.txt", "Id UserId Name Date"),
+      ("comments", "../stackoverflow_dataset/comments.txt", "Id PostId Score Text CreationDate UserId"),
+      ("posts", "../stackoverflow_dataset/posts.txt", "Id PostTypeId ParentID AcceptedAnswerId CreationDate Score ViewCount Body OwnerUserId LastEditorUserId LastEditorDisplayName LastEditDate LastActivityDate CommunityOwnedDate ClosedDate Title Tags AnswerCount CommentCount FavoriteCount"),
+      ("postHistory", "../stackoverflow_dataset/postHistory.txt","Id PostHistoryTypeId PostId RevisionGUID CreationDate UserId UserDisplayName Comment Text CloseReasonId"),
+      ("postLinks", "../stackoverflow_dataset/postLinks.txt", "Id CreationDate PostId RelatedPostId PostLinkTypeId"),
+      ("users", "../stackoverflow_dataset/users.txt", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes"),
+      ("votes", "../stackoverflow_dataset/votes.txt", "Id PostId VoteTypeId UserId CreationDate")
+    )
+
+    // Key each file's DataFrame by its table name.
+    xmlInfos.map { case (tableName, filepath, attributes) =>
+      (tableName, ParseXMLInfo((filepath, attributes)))
+    }.toMap
+  }
+
+  /*
+   * Build the DataFrame for a single file.
+   *
+   * xmlInfo: (filepath to the XML file, space separated attribute names)
+   */
+  private def ParseXMLInfo(xmlInfo: (String, String)) : DataFrame = {
+    val (inputFilepath, schemaString) = xmlInfo
+    // Generate schema using XML attribute string
+    val schema = GenerateSchemaFromString(schemaString)
+    // Generate RDD of data from the XML file
+    val rdd = ParseInput(inputFilepath, schemaString)
+    // Convert RDD to DataFrame for easier processing
+    Main.sqlContext.createDataFrame(rdd, schema)
+  }
+
+  /*
+   * Generate a schema based on the string of XML attributes: one nullable
+   * string column per space separated attribute name.
+   */
+  private def GenerateSchemaFromString(schemaString: String) : StructType = {
+    val fields = schemaString.split(" ")
+      .map(fieldName => StructField(fieldName, StringType, nullable = true))
+    StructType(fields)
+  }
+
+  /*
+   * Create RDD from XML file
+   *
+   * inputFilepath: Filepath to XML file
+   * schemaString: Space separated attribute values
+   */
+  private def ParseInput(inputFilepath: String, schemaString: String) : RDD[Row] = {
+    // Split the attribute list once instead of once per input line.
+    val fieldNames = schemaString.split(" ")
+
+    // Create spark text file object and map every line to a Row
+    Main.sc.textFile(inputFilepath).map(line => ParsingFunc(line, fieldNames))
+  }
+
+  /*
+   * Retrieve XML attributes from a String
+   *
+   * line: XML file line; it must be a well-formed XML element, otherwise
+   *       the XML loader throws
+   * fieldNames: attribute names to read, in column order
+   */
+  private def ParsingFunc(line: String, fieldNames: Array[String]) : Row = {
+    // Parse line of XML using Scala's built in XML library
+    val xmlLine = scala.xml.XML.loadString(line)
+    // Create array of values with element for each requested attribute
+    val lineData: Array[String] = fieldNames.map(fieldName => getXMLAttribute(xmlLine, fieldName))
+
+    Row.fromSeq(lineData)
+  }
+
+  /*
+   * Read a single attribute from an XML element.
+   *
+   * Return an empty string if the attribute doesn't exist
+   */
+  private def getXMLAttribute(xmlLine: scala.xml.Elem, attribute: String) : String =
+    // Elem.attribute returns None for a missing attribute, so no exception
+    // handling is needed.
+    xmlLine.attribute(attribute).map(_.text).getOrElse("")
+}