Created basic word count spark project
Set versions of pom.xml for correct compilation using mvn
This commit is contained in:
+80
@@ -0,0 +1,80 @@
|
||||
Building and Running the Project
|
||||
================================
|
||||
|
||||
These instructions will work using the files at this commit:
|
||||
|
||||
git checkout 800b1f59edaa20a9b65f32a815605307e1102baa
|
||||
|
||||
First, you need to download the small sample of the Stack Overflow data that
|
||||
can be found here:
|
||||
|
||||
https://drive.google.com/open?id=0B0uip08Km2LPVTFTRFhrdHF2WW8
|
||||
|
||||
Put it in a directory at the project's root called ./stackoverflow_dataset
|
||||
|
||||
Next, the following programs need to be installed on your system (Homebrew was
|
||||
used for easy installation on OS X):
|
||||
|
||||
Spark:
|
||||
|
||||
brew install apache-spark
|
||||
|
||||
Scala:
|
||||
|
||||
brew install scala
|
||||
|
||||
Maven:
|
||||
|
||||
brew install maven
|
||||
|
||||
To build and run the project locally you need to set versions in the pom.xml
|
||||
file to match those of the programs installed on your system.
|
||||
The following lines (marked with <<<<) need to be updated in the pom.xml file:
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.scala-lang</groupId>
|
||||
<artifactId>scala-library</artifactId>
|
||||
<version>2.11.8</version> <<<<
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId> <<<<
|
||||
<version>2.0.2</version> <<<<
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
Running:
|
||||
|
||||
spark-shell
|
||||
|
||||
should give you output that will tell you your versions similar to this:
|
||||
|
||||
Welcome to
|
||||
____ __
|
||||
/ __/__ ___ _____/ /__
|
||||
_\ \/ _ \/ _ `/ __/ '_/
|
||||
/___/ .__/\_,_/_/ /_/\_\ version 2.0.2
|
||||
/_/
|
||||
|
||||
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_92)
|
||||
Type in expressions to have them evaluated.
|
||||
Type :help for more information.
|
||||
|
||||
Having edited this pom.xml file, run the following from the root of the
|
||||
project to compile:
|
||||
|
||||
mvn clean package
|
||||
|
||||
This should run successfully (and will probably download and install a whole bunch of
|
||||
stuff the first time you run it...)
|
||||
|
||||
To run the compiled application:
|
||||
|
||||
cd target
|
||||
spark-submit --class KMeans --master local KMeans-0.0.1.jar
|
||||
|
||||
That should run without errors, producing an output folder. Check that
|
||||
something has been generated by running:
|
||||
|
||||
cat output/part-00000
|
||||
@@ -0,0 +1,92 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
Copyright (c) 2014, Cloudera, Inc. All Rights Reserved.
|
||||
|
||||
Cloudera, Inc. licenses this file to you under the Apache License,
|
||||
Version 2.0 (the "License"). You may not use this file except in
|
||||
compliance with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
|
||||
CONDITIONS OF ANY KIND, either express or implied. See the License for
|
||||
the specific language governing permissions and limitations under the
|
||||
License.
|
||||
-->
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<groupId>uk.ac.qmul.spark</groupId>
|
||||
<artifactId>KMeans</artifactId>
|
||||
<version>0.0.1</version>
|
||||
<packaging>jar</packaging>
|
||||
<name>"Spark KMeans Clustering"</name>
|
||||
|
||||
<repositories>
|
||||
<repository>
|
||||
<id>scala-tools.org</id>
|
||||
<name>Scala-tools Maven2 Repository</name>
|
||||
<url>http://scala-tools.org/repo-releases</url>
|
||||
</repository>
|
||||
<repository>
|
||||
<id>maven-hadoop</id>
|
||||
<name>Hadoop Releases</name>
|
||||
<url>https://repository.cloudera.com/content/repositories/releases/</url>
|
||||
</repository>
|
||||
<repository>
|
||||
<id>cloudera-repos</id>
|
||||
<name>Cloudera Repos</name>
|
||||
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
|
||||
</repository>
|
||||
</repositories>
|
||||
|
||||
<pluginRepositories>
|
||||
<pluginRepository>
|
||||
<id>scala-tools.org</id>
|
||||
<name>Scala-tools Maven2 Repository</name>
|
||||
<url>http://scala-tools.org/repo-releases</url>
|
||||
</pluginRepository>
|
||||
</pluginRepositories>
|
||||
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
|
||||
</properties>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.scala-tools</groupId>
|
||||
<artifactId>maven-scala-plugin</artifactId>
|
||||
<version>2.15.2</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>compile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<version>3.1</version>
|
||||
<configuration>
|
||||
<source>1.6</source>
|
||||
<target>1.6</target>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.scala-lang</groupId>
|
||||
<artifactId>scala-library</artifactId>
|
||||
<version>2.11.8</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
<version>2.0.2</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
@@ -0,0 +1,15 @@
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.SparkContext._
|
||||
import org.apache.spark._
|
||||
|
||||
/** Spark driver that currently performs a simple word count.
  *
  * Despite its name, this object does not run k-means clustering: it reads a
  * text file, splits every line on single spaces, counts how often each
  * resulting token occurs and writes the `(token, count)` pairs out as text.
  */
object KMeans {

  /** Runs the word-count job.
    *
    * The input (`../stackoverflow_dataset/badges.txt`) and the output
    * directory (`output`) are hard-coded relative paths, so where they resolve
    * depends on the directory the job is launched from.
    *
    * NOTE(review): with Spark's default settings `saveAsTextFile` is expected
    * to fail if the `output` directory already exists -- confirm, and remove
    * the directory between runs if so.
    *
    * @param args command-line arguments; currently unused
    */
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("Spark KMeans Clustering"))
    try {
      val inputfile = sc.textFile("../stackoverflow_dataset/badges.txt")

      // Tokenise on single spaces, pair every token with 1, then sum per token.
      val counts = inputfile
        .flatMap(line => line.split(" "))
        .map(word => (word, 1))
        .reduceByKey(_ + _)

      counts.saveAsTextFile("output")
    } finally {
      // Always release the context, even when the job fails part-way through.
      sc.stop()
    }
  }
}
|
||||
Reference in New Issue
Block a user