Finished graph generation for task 3

This commit is contained in:
Sam Perry
2016-11-12 23:57:25 +00:00
parent 162393e416
commit 85e591e230
4 changed files with 43 additions and 22 deletions
+2 -2
View File
@@ -8,7 +8,7 @@
<property name="dist" location="dist" />
<!--<property name="hadoop.version" value="2.0.0-mr1-cdh4.1.2" /> -->
<property name="hadoop.base.path" value="/usr/local/Cellar/hadoop/2.7.3/" />
<property name="hadoop.base.path" value="/usr/lib/hadoop/client/" />
<path id="classpath">
@@ -28,7 +28,7 @@
<!-- Compile the java code from ${src} into ${build} -->
<property name="myclasspath" refid="classpath"/>
<echo message="Classpath = ${myclasspath}"/>
<javac srcdir="${src}" debug="true" destdir="${build}" target="1.8" source="1.8">
<javac srcdir="${src}" debug="true" destdir="${build}" target="1.7" source="1.7">
<classpath refid="classpath"/>
</javac>
</target>
+4 -20
View File
@@ -11,33 +11,17 @@ def main():
# Store each line as a string in a list
lines = data.readlines()
# Get the highest index of tweets
indexes = np.array([x.split()[0] for x in lines], dtype=int)
max_ind = np.max(indexes)
min_ind = np.min(indexes)
labels = [x.split()[0] for x in lines]
# Create a 2D array of zeros to fill with index-count pairs
data = np.zeros([max_ind-min_ind+1, 2], dtype=int)
# Fill first column with indexes for each category (1-5, 6-10 etc...)
data[:, 0] = np.arange(max_ind-min_ind+1)+1
labels = [[] for i in xrange(max_ind-min_ind+1)]
for line in lines:
# Split the line into its two components
line = line.split()
# Get the index stored in component 1
ind = int(line[0])-min_ind
if ind < 0:
pdb.set_trace()
# Set column two at the index provided to the value provided
data[ind][1] = line[-1]
labels[ind] = "{0} {1}".format(*line[:-2])
line = line.split()[1:]
# Create labels for each index to show each group's range
# Plot data...
x = data[:, 0]
y = data[:, 1]
x = labels
y = lines[:, 1]
markerline, stemlines, baseline = plt.stem(x, y, '-')
plt.xticks(x, labels, rotation='vertical')
xmin,xmax = plt.xlim()
+35
View File
@@ -0,0 +1,35 @@
#!/usr/bin/env python
import numpy as np
import matplotlib.pyplot as plt
import pdb
def main():
    """Show a stem plot of the counts stored in the Hadoop final output.

    Reads ``../FinalOutput.txt`` (relative to the current working
    directory).  Every non-blank line is split on tab characters: the first
    field is used as the x-tick label and the second field is parsed as an
    integer count.  The counts are drawn with ``plt.stem`` on a logarithmic
    y axis and the figure is displayed with ``plt.show()``.

    Raises:
        IOError: if the output file cannot be opened (``OSError`` on
            Python 3).
        IndexError: if a non-blank line contains no tab separator.
        ValueError: if the second field of a line is not an integer.
    """
    # Open final output generated from hadoop
    with open("../FinalOutput.txt") as data:
        # Blank lines carry no label/count pair and would otherwise fail
        # with an IndexError when the second field is read, so drop them.
        rows = [line.split('\t') for line in data if line.strip()]

    # First field of each row is the label shown under its stem
    labels = [row[0] for row in rows]

    # Plot data: one evenly spaced stem per row, in file order
    x = np.arange(len(labels))
    y = np.array([int(row[1]) for row in rows])
    markerline, stemlines, baseline = plt.stem(x, y, '-')
    plt.xticks(x, labels, rotation='vertical')

    # Widen the x range by 2.5% of its width on each side
    xmin, xmax = plt.xlim()
    xbuff = 0.025 * (xmax - xmin)
    plt.xlim(xmin - xbuff, xmax + xbuff)

    plt.setp(stemlines, 'color', 'b')
    try:
        # Keyword understood by the Matplotlib releases this script was
        # written against.
        plt.yscale("log", nonposy='clip')
    except TypeError:
        # Newer Matplotlib removed ``nonposy``; ``nonpositive`` is the
        # replacement keyword with the same meaning.
        plt.yscale("log", nonpositive='clip')
    plt.grid(True)

    # Enlarge the bottom margin (presumably to fit the vertical tick labels)
    fig = plt.gcf()
    fig.subplots_adjust(bottom=0.23)
    plt.show()
# Run the plot only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
+2
View File
@@ -46,6 +46,8 @@ public class TweetFreqDayMapper extends Mapper<Object, Text, Text, LongWritable>
out = Denonyms.findDenonym(tweet);
}
}
System.out.println(out);
System.out.println(tweet);
return out;
}