Created basic word count spark project

Set versions of pom.xml for correct compilation using mvn
2016-11-30 22:36:39 +00:00
parent 283991656b
commit e824e87c57
3 changed files with 187 additions and 0 deletions
@@ -0,0 +1,80 @@
+Building and Running the Project
+================================
+
+These instructions will work using the files at this commit:
+    
+    git checkout 800b1f59edaa20a9b65f32a815605307e1102baa
+
+First, you need to download the small sample of the Stack Overflow data that
+can be found here:
+
+https://drive.google.com/open?id=0B0uip08Km2LPVTFTRFhrdHF2WW8
+
+Put it in a directory called ./stackoverflow_dataset at the project's root; the application reads ./stackoverflow_dataset/badges.txt
+
+Next, the following programs need to be installed on your system (Homebrew was
+used for easy installation on OS X):
+
+Spark:
+
+    brew install apache-spark
+
+Scala:
+
+    brew install scala
+
+Maven:
+
+    brew install maven
+
+To build and run the project locally, you need to set the versions in the pom.xml
+file to match those of the programs installed on your system.
+The lines marked with <<<< below need to be updated in the pom.xml file (the markers are not part of the file):
+
+    <dependencies>
+      <dependency>
+        <groupId>org.scala-lang</groupId>
+        <artifactId>scala-library</artifactId>
+        <version>2.11.8</version>                     <<<<
+      </dependency>
+      <dependency>
+        <groupId>org.apache.spark</groupId>
+        <artifactId>spark-core_2.11</artifactId>      <<<<
+        <version>2.0.2</version>                      <<<<
+      </dependency>
+    </dependencies>
+
+Running:
+
+    spark-shell
+
+should print a banner showing your Spark and Scala versions, similar to this:
+
+    Welcome to
+          ____              __
+         / __/__  ___ _____/ /__
+        _\ \/ _ \/ _ `/ __/  '_/
+       /___/ .__/\_,_/_/ /_/\_\   version 2.0.2
+          /_/
+
+    Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_92)
+    Type in expressions to have them evaluated.
+    Type :help for more information.
+
+After editing the pom.xml file, run the following from the root of the
+project to compile:
+
+    mvn clean package
+
+This should run successfully (and will probably download and install a whole bunch of
+stuff the first time you run it...)
+
+To run the compiled application:
+
+    cd target
+    spark-submit --class KMeans --master local KMeans-0.0.1.jar
+
+That should run without errors, producing an output folder inside target. Check that
+something has been generated by running:
+
+    cat output/part-00000
@@ -0,0 +1,92 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Copyright (c) 2014, Cloudera, Inc. All Rights Reserved.
+
+  Cloudera, Inc. licenses this file to you under the Apache License,
+  Version 2.0 (the "License"). You may not use this file except in
+  compliance with the License. You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+  CONDITIONS OF ANY KIND, either express or implied. See the License for
+  the specific language governing permissions and limitations under the
+  License.
+  -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>uk.ac.qmul.spark</groupId>
+  <artifactId>KMeans</artifactId>
+  <version>0.0.1</version>
+  <packaging>jar</packaging>
+  <name>"Spark KMeans Clustering"</name>
+  
+  <repositories> <!-- Additional artifact repositories declared for dependency resolution. -->
+    <repository>
+      <id>scala-tools.org</id>
+      <name>Scala-tools Maven2 Repository</name>
+      <url>http://scala-tools.org/repo-releases</url> <!-- NOTE(review): plain-HTTP URL; confirm this host is still reachable and still needed. -->
+    </repository>
+    <repository>
+      <id>maven-hadoop</id>
+      <name>Hadoop Releases</name>
+      <url>https://repository.cloudera.com/content/repositories/releases/</url> <!-- NOTE(review): the dependencies in this POM are only org.scala-lang and org.apache.spark artifacts; confirm the Cloudera repositories are required. -->
+    </repository>
+    <repository>
+      <id>cloudera-repos</id>
+      <name>Cloudera Repos</name>
+      <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
+    </repository>
+  </repositories>
+
+  <pluginRepositories> <!-- Repository declared for resolving build plugins (here: the Scala compiler plugin). -->
+    <pluginRepository>
+      <id>scala-tools.org</id>
+      <name>Scala-tools Maven2 Repository</name>
+      <url>http://scala-tools.org/repo-releases</url> <!-- NOTE(review): plain-HTTP URL; confirm this host is still reachable. -->
+    </pluginRepository>
+  </pluginRepositories>
+
+  <properties> <!-- Use UTF-8 for reading sources and for generated reports. -->
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
+  </properties>
+
+  <build>
+    <plugins>
+      <plugin> <!-- Scala compiler plugin: compiles the project's Scala sources. -->
+        <groupId>org.scala-tools</groupId>
+        <artifactId>maven-scala-plugin</artifactId>
+        <version>2.15.2</version>
+        <executions>
+          <execution>
+            <goals>
+              <goal>compile</goal> <!-- Only the main-source compile goal is run; no test-compile goal is configured. -->
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin> <!-- Java compiler settings. -->
+        <artifactId>maven-compiler-plugin</artifactId>
+        <version>3.1</version>
+        <configuration>
+          <source>1.6</source> <!-- NOTE(review): the README's sample spark-shell output shows Java 1.8; confirm that the 1.6 source/target level is intended. -->
+          <target>1.6</target>
+        </configuration>
+      </plugin>
+    </plugins>  
+  </build>
+
+  <dependencies> <!-- Versions below must be edited to match the locally installed tools, as described in the README. -->
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>scala-library</artifactId>
+      <version>2.11.8</version> <!-- Set to the Scala version reported by spark-shell. -->
+    </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_2.11</artifactId> <!-- The _2.11 suffix should agree with the Scala version above. -->
+      <version>2.0.2</version> <!-- Set to the Spark version reported by spark-shell. -->
+    </dependency>
+  </dependencies>
+</project>
@@ -0,0 +1,15 @@
+import org.apache.spark.SparkContext 
+import org.apache.spark.SparkContext._ 
+import org.apache.spark._
+
+object KMeans {
+  /** Word-count job over the badges sample: saves (token, count) pairs as text under "output". */
+  def main(args: Array[String]): Unit = {
+    val sc = new SparkContext(new SparkConf().setAppName("Spark KMeans Clustering"))
+    try {
+      val lines = sc.textFile("../stackoverflow_dataset/badges.txt") // path is relative to the working directory
+      val counts = lines.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _) // sum the 1s per token
+      counts.saveAsTextFile("output")
+    } finally sc.stop() // always release the SparkContext, even when the job fails
+  }
+}