Created basic word count spark project

Set versions of pom.xml for correct compilation using mvn
This commit is contained in:
Sam Perry
2016-11-30 22:36:39 +00:00
parent 283991656b
commit e824e87c57
3 changed files with 187 additions and 0 deletions
+80
View File
@@ -0,0 +1,80 @@
Building and Running the Project
================================
These instructions will work using the files at this commit:
git checkout 800b1f59edaa20a9b65f32a815605307e1102baa
First, download the small sample of the Stack Overflow data that
can be found here:
https://drive.google.com/open?id=0B0uip08Km2LPVTFTRFhrdHF2WW8
Put it in a directory at the project's root called ./stackoverflow_dataset
(the application reads ./stackoverflow_dataset/badges.txt).
Next, the following programs need to be installed on your system (Homebrew was
used for easy installation on OS X):
Spark:
brew install apache-spark
Scala:
brew install scala
Maven:
brew install maven
To build and run the project locally, you need to set the versions in the pom.xml
file to match those of the programs installed on your system.
The following lines (the ones marked with <<<<) need to be updated in the pom.xml file:
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.11.8</version> <<<<
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId> <<<<
<version>2.0.2</version> <<<<
</dependency>
</dependencies>
Running:
spark-shell
should print a banner showing your installed Spark and Scala versions, similar to this:
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 2.0.2
/_/
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_92)
Type in expressions to have them evaluated.
Type :help for more information.
Having edited this pom.xml file, run the following from the root of the
project to compile:
mvn clean package
This should run successfully (it will probably download and install a large number of
dependencies the first time you run it).
To run the compiled application:
cd target
spark-submit --class KMeans --master local KMeans-0.0.1.jar
That should run without errors, producing an output folder. Check that
something has been generated by running:
cat output/part-00000
+92
View File
@@ -0,0 +1,92 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Copyright (c) 2014, Cloudera, Inc. All Rights Reserved.
Cloudera, Inc. licenses this file to you under the Apache License,
Version 2.0 (the "License"). You may not use this file except in
compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
CONDITIONS OF ANY KIND, either express or implied. See the License for
the specific language governing permissions and limitations under the
License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>uk.ac.qmul.spark</groupId>
<artifactId>KMeans</artifactId>
<version>0.0.1</version>
<packaging>jar</packaging>
<name>"Spark KMeans Clustering"</name>
<repositories>
<repository>
<id>scala-tools.org</id>
<name>Scala-tools Maven2 Repository</name>
<url>http://scala-tools.org/repo-releases</url>
</repository>
<repository>
<id>maven-hadoop</id>
<name>Hadoop Releases</name>
<url>https://repository.cloudera.com/content/repositories/releases/</url>
</repository>
<repository>
<id>cloudera-repos</id>
<name>Cloudera Repos</name>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
</repositories>
<pluginRepositories>
<pluginRepository>
<id>scala-tools.org</id>
<name>Scala-tools Maven2 Repository</name>
<url>http://scala-tools.org/repo-releases</url>
</pluginRepository>
</pluginRepositories>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
<!-- Build configuration: two compiler plugins are declared. -->
<build>
<plugins>
<!-- Scala compilation: binds the maven-scala-plugin `compile` goal into the build. -->
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Java compilation: source and target levels are both pinned to 1.6. -->
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.6</source>
<target>1.6</target>
</configuration>
</plugin>
</plugins>
</build>
<!--
Project dependencies: the Scala standard library and Spark core.
Per the project's build instructions, the two <version> values and the
Scala suffix in the spark-core artifactId (`_2.11`) must be edited to match
the Scala and Spark versions installed locally (as reported by `spark-shell`).
-->
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.11.8</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.0.2</version>
</dependency>
</dependencies>
</project>
+15
View File
@@ -0,0 +1,15 @@
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark._
/**
 * Entry point of the Spark application.
 *
 * Despite its name, this object currently performs a plain word count: it
 * reads a text file, splits every line on single spaces, counts how often
 * each token occurs and writes the `(token, count)` pairs out as text.
 * The name `KMeans` is kept because the application is launched with
 * `spark-submit --class KMeans`.
 */
object KMeans {

  /** Input file used when no path is given on the command line. */
  private val DefaultInputPath = "../stackoverflow_dataset/badges.txt"

  /** Output directory used when no path is given on the command line. */
  private val DefaultOutputPath = "output"

  /**
   * Runs the word count job.
   *
   * @param args optional command-line arguments:
   *             `args(0)` is the input text file (defaults to
   *             `../stackoverflow_dataset/badges.txt`) and
   *             `args(1)` is the output directory (defaults to `output`).
   *             With no arguments the job uses both defaults.
   */
  def main(args: Array[String]): Unit = {
    val inputPath = args.lift(0).getOrElse(DefaultInputPath)
    val outputPath = args.lift(1).getOrElse(DefaultOutputPath)

    val sc = new SparkContext(new SparkConf().setAppName("Spark KMeans Clustering"))
    try {
      val counts = sc
        .textFile(inputPath)
        // Split on a single space only; consecutive spaces therefore yield
        // empty-string tokens, which are counted like any other token.
        .flatMap(line => line.split(" "))
        .map(word => (word, 1))
        .reduceByKey(_ + _)

      counts.saveAsTextFile(outputPath)
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}