Finished graph generation for task 3

2016-11-12 23:57:25 +00:00
parent 162393e416
commit 85e591e230
4 changed files with 43 additions and 22 deletions
@@ -8,7 +8,7 @@
 	<property name="dist" location="dist" />

 	<!--<property name="hadoop.version" value="2.0.0-mr1-cdh4.1.2" /> -->
-    <property name="hadoop.base.path" value="/usr/local/Cellar/hadoop/2.7.3/" />
+    <property name="hadoop.base.path" value="/usr/lib/hadoop/client/" />


 	<path id="classpath">
@@ -28,7 +28,7 @@
 		<!-- Compile the java code from ${src} into ${build} -->
 		<property name="myclasspath" refid="classpath"/>
 		<echo message="Classpath = ${myclasspath}"/>
-		<javac srcdir="${src}" debug="true" destdir="${build}" target="1.8" source="1.8">
+		<javac srcdir="${src}" debug="true" destdir="${build}" target="1.7" source="1.7">
 			<classpath  refid="classpath"/>
 		</javac>
 	</target>
@@ -11,33 +11,17 @@ def main():
        # Store each line as a string in a list
        lines = data.readlines()
        # Get the highest index of tweets
-        indexes = np.array([x.split()[0] for x in lines], dtype=int)
-        max_ind = np.max(indexes)
-        min_ind = np.min(indexes)
+        labels = [x.split()[0] for x in lines]

-        # Create a 2D array of zeros to fill with index-count pairs
-        data = np.zeros([max_ind-min_ind+1, 2], dtype=int)
-        # Fill first column with indexes for each category (1-5, 6-10 etc...)
-        data[:, 0] = np.arange(max_ind-min_ind+1)+1
-
-        labels = [[] for i in xrange(max_ind-min_ind+1)]
        for line in lines:
             # Split the line into its two components
-            line = line.split()
-            # Get the index stored in component 1
-            ind = int(line[0])-min_ind
-            if ind < 0:
-                pdb.set_trace()
-
-            # Set column two at the index provided to the value provided
-            data[ind][1] = line[-1]
-            labels[ind] = "{0} {1}".format(*line[:-2])
+            line = line.split()[1:]

        # Create labels for each index to show each group's range

        # Plot data...
-        x = data[:, 0]
-        y = data[:, 1]
+        x = labels
+        y = lines[:, 1]
        markerline, stemlines, baseline = plt.stem(x, y, '-')
        plt.xticks(x, labels, rotation='vertical')
        xmin,xmax = plt.xlim()
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pdb
+
+
+def main():
+    # Draw a stem plot from the final output generated from hadoop: one stem per input line
+    with open("../FinalOutput.txt") as data:
+        # Read every line up front; each line is parsed below as tab-separated fields
+        lines = data.readlines()
+        # The label of a line is the text before its first tab
+        labels = [x.split('\t')[0] for x in lines]
+
+        # x is just the positions 0..len(labels)-1; the label text is attached as tick labels below
+
+        # Plot the values as stems, one per label
+        x = np.arange(len(labels))
+        y = np.array([int(z.split('\t')[1]) for z in lines])  # second tab-separated field, as int
+        markerline, stemlines, baseline = plt.stem(x, y, '-')
+        plt.xticks(x, labels, rotation='vertical')
+        xmin,xmax = plt.xlim()
+        xbuff = 0.025*(xmax-xmin)  # pad 2.5% of the current x-range on each side
+        plt.xlim(xmin-xbuff,xmax+xbuff)
+        plt.setp(stemlines, 'color', 'b')
+        plt.yscale("log", nonposy='clip')  # log-scaled y-axis
+        plt.grid(True)
+        fig = plt.gcf()
+        fig.subplots_adjust(bottom=0.23)  # presumably leaves room for the vertical tick labels
+        plt.show()
+
+
+if __name__ == "__main__":
+    main()
@@ -46,6 +46,8 @@ public class TweetFreqDayMapper extends Mapper<Object, Text, Text, LongWritable>
                out = Denonyms.findDenonym(tweet);
            }
        }
+        System.out.println(out);
+        System.out.println(tweet);
        return out;
    }