Merge branch 'joe-dev2' of https://github.com/Pezz89/Big_Data_Assignment_2 into joe-dev2

This commit is contained in:
joedarby
2016-12-16 12:27:44 +00:00
6 changed files with 509 additions and 75 deletions
+3 -1
View File
@@ -1,6 +1,8 @@
.DS_Store
bin/
/target
target/
.idea/
stackoverflow_data/
# Compiled Object files
*.slo
*.lo
+16
View File
@@ -48,4 +48,20 @@
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="true" project-jdk-name="1.8 (1)" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
<component name="masterDetails">
<states>
<state key="ProjectJDKs.UI">
<settings>
<last-edited>1.8</last-edited>
<splitter-proportions>
<option name="proportions">
<list>
<option value="0.2" />
</list>
</option>
</splitter-proportions>
</settings>
</state>
</states>
</component>
</project>
+427 -35
View File
@@ -1,11 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<<<<<<< HEAD
<list default="true" id="b41a9788-25b3-4e04-923f-17cde259631b" name="Default" comment="">
<change type="NEW" beforePath="" afterPath="$PROJECT_DIR$/run_project.sh" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/src/main/scala/Main.scala" afterPath="$PROJECT_DIR$/src/main/scala/Main.scala" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/src/main/scala/XMLParser.scala" afterPath="$PROJECT_DIR$/src/main/scala/XMLParser.scala" />
=======
<list default="true" id="74fa95ce-dfd4-40da-a7a1-b336badfaea8" name="Default" comment="">
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/src/main/scala/KMeans.scala" afterPath="$PROJECT_DIR$/src/main/scala/KMeans.scala" />
>>>>>>> 23a3c3f3fde97ee499a7fbcbe16e1c28c3297e05
</list>
<ignored path="$PROJECT_DIR$/target/" />
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
@@ -24,8 +29,13 @@
<file leaf-file-name="KMeans.scala" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/src/main/scala/KMeans.scala">
<provider selected="true" editor-type-id="text-editor">
<<<<<<< HEAD
<state relative-caret-position="684">
<caret line="40" column="3" lean-forward="false" selection-start-line="40" selection-start-column="3" selection-end-line="40" selection-end-column="3" />
=======
<state relative-caret-position="135">
<caret line="36" column="86" lean-forward="true" selection-start-line="36" selection-start-column="86" selection-end-line="36" selection-end-column="86" />
>>>>>>> 23a3c3f3fde97ee499a7fbcbe16e1c28c3297e05
<folding>
<element signature="e#23#54#0" expanded="true" />
</folding>
@@ -33,12 +43,24 @@
</provider>
</entry>
</file>
<<<<<<< HEAD
<file leaf-file-name="Main.scala" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/src/main/scala/Main.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="258">
<caret line="38" column="38" lean-forward="false" selection-start-line="38" selection-start-column="38" selection-end-line="38" selection-end-column="38" />
<folding />
=======
<file leaf-file-name="Row.scala" pinned="false" current-in-tab="false">
<entry file="jar://$MAVEN_REPOSITORY$/org/apache/spark/spark-catalyst_2.10/1.6.0/spark-catalyst_2.10-1.6.0-sources.jar!/org/apache/spark/sql/Row.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="324">
<caret line="38" column="34" lean-forward="false" selection-start-line="38" selection-start-column="34" selection-end-line="38" selection-end-column="34" />
<folding>
<element signature="n#!!doc" expanded="false" />
<element signature="e#832#872#0" expanded="false" />
</folding>
>>>>>>> 23a3c3f3fde97ee499a7fbcbe16e1c28c3297e05
</state>
</provider>
</entry>
@@ -46,8 +68,13 @@
<file leaf-file-name="XMLParser.scala" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/src/main/scala/XMLParser.scala">
<provider selected="true" editor-type-id="text-editor">
<<<<<<< HEAD
<state relative-caret-position="1044">
<caret line="65" column="35" lean-forward="false" selection-start-line="65" selection-start-column="35" selection-end-line="65" selection-end-column="35" />
=======
<state relative-caret-position="441">
<caret line="135" column="0" lean-forward="true" selection-start-line="135" selection-start-column="0" selection-end-line="135" selection-end-column="0" />
>>>>>>> 23a3c3f3fde97ee499a7fbcbe16e1c28c3297e05
<folding>
<element signature="e#23#59#0" expanded="true" />
</folding>
@@ -55,6 +82,18 @@
</provider>
</entry>
</file>
<file leaf-file-name="Main.scala" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/src/main/scala/Main.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="419">
<caret line="39" column="28" lean-forward="true" selection-start-line="39" selection-start-column="28" selection-end-line="39" selection-end-column="28" />
<folding>
<element signature="e#22#58#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
</leaf>
</component>
<component name="Git.Settings">
@@ -68,24 +107,35 @@
<component name="IdeDocumentHistory">
<option name="CHANGED_PATHS">
<list>
<<<<<<< HEAD
=======
<option value="$PROJECT_DIR$/src/main/scala/Main.scala" />
<option value="$PROJECT_DIR$/src/main/scala/XMLParser.scala" />
>>>>>>> 23a3c3f3fde97ee499a7fbcbe16e1c28c3297e05
<option value="$PROJECT_DIR$/src/main/scala/KMeans.scala" />
<option value="$PROJECT_DIR$/src/main/scala/XMLParser.scala" />
<option value="$PROJECT_DIR$/src/main/scala/Main.scala" />
</list>
</option>
</component>
<component name="MavenImportPreferences">
<option name="importingSettings">
<MavenImportingSettings>
<option name="importAutomatically" value="true" />
</MavenImportingSettings>
</option>
<component name="JsBuildToolGruntFileManager" detection-done="true" sorting="DEFINITION_ORDER" />
<component name="JsBuildToolPackageJson" detection-done="true" sorting="DEFINITION_ORDER" />
<component name="JsGulpfileManager">
<detection-done>true</detection-done>
<sorting>DEFINITION_ORDER</sorting>
</component>
<component name="ProjectFrameBounds">
<<<<<<< HEAD
<option name="x" value="65" />
<option name="y" value="24" />
<option name="width" value="1295" />
<option name="height" value="744" />
=======
<option name="x" value="77" />
<option name="y" value="122" />
<option name="width" value="1400" />
<option name="height" value="893" />
>>>>>>> 23a3c3f3fde97ee499a7fbcbe16e1c28c3297e05
</component>
<component name="ProjectView">
<navigator currentView="ProjectPane" proportions="" version="1">
@@ -143,6 +193,12 @@
<pane id="Scope" />
</panes>
</component>
<component name="PropertiesComponent">
<property name="WebServerToolWindowFactoryState" value="false" />
<property name="aspect.path.notification.shown" value="true" />
<property name="js.eslint.eslintPackage" value="" />
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
</component>
<component name="RunManager">
<configuration default="true" type="#org.jetbrains.idea.devkit.run.PluginConfigurationType" factoryName="Plugin">
<module name="" />
@@ -229,6 +285,136 @@
<envs />
<method />
</configuration>
<configuration default="true" type="ArquillianJUnit" factoryName="" nameIsGenerated="true">
<extension name="coverage" enabled="false" merge="false" sample_coverage="true" runner="idea" />
<module name="" />
<option name="arquillianRunConfiguration">
<value>
<option name="containerStateName" value="" />
</value>
</option>
<option name="ALTERNATIVE_JRE_PATH_ENABLED" value="false" />
<option name="ALTERNATIVE_JRE_PATH" />
<option name="PACKAGE_NAME" />
<option name="MAIN_CLASS_NAME" />
<option name="METHOD_NAME" />
<option name="TEST_OBJECT" value="class" />
<option name="VM_PARAMETERS" />
<option name="PARAMETERS" />
<option name="WORKING_DIRECTORY" />
<option name="ENV_VARIABLES" />
<option name="PASS_PARENT_ENVS" value="true" />
<option name="TEST_SEARCH_SCOPE">
<value defaultName="singleModule" />
</option>
<envs />
<patterns />
<method />
</configuration>
<configuration default="true" type="ArquillianTestNG" factoryName="">
<extension name="coverage" enabled="false" merge="false" sample_coverage="true" runner="idea" />
<module name="" />
<option name="arquillianRunConfiguration">
<value>
<option name="containerStateName" value="" />
</value>
</option>
<option name="ALTERNATIVE_JRE_PATH_ENABLED" value="false" />
<option name="ALTERNATIVE_JRE_PATH" />
<option name="SUITE_NAME" />
<option name="PACKAGE_NAME" />
<option name="MAIN_CLASS_NAME" />
<option name="METHOD_NAME" />
<option name="GROUP_NAME" />
<option name="TEST_OBJECT" value="CLASS" />
<option name="VM_PARAMETERS" />
<option name="PARAMETERS" />
<option name="WORKING_DIRECTORY" />
<option name="OUTPUT_DIRECTORY" />
<option name="ANNOTATION_TYPE" />
<option name="ENV_VARIABLES" />
<option name="PASS_PARENT_ENVS" value="true" />
<option name="TEST_SEARCH_SCOPE">
<value defaultName="singleModule" />
</option>
<option name="USE_DEFAULT_REPORTERS" value="false" />
<option name="PROPERTIES_FILE" />
<envs />
<properties />
<listeners />
<method />
</configuration>
<configuration default="true" type="Cold Fusion runner description" factoryName="Cold Fusion" custom_browser="" web_path="">
<method />
</configuration>
<configuration default="true" type="CucumberJavaRunConfigurationType" factoryName="Cucumber java">
<extension name="coverage" enabled="false" merge="false" sample_coverage="true" runner="idea" />
<option name="myFilePath" />
<option name="GLUE" />
<option name="myNameFilter" />
<option name="myGeneratedName" />
<option name="MAIN_CLASS_NAME" />
<option name="VM_PARAMETERS" />
<option name="PROGRAM_PARAMETERS" />
<option name="WORKING_DIRECTORY" />
<option name="ALTERNATIVE_JRE_PATH_ENABLED" value="false" />
<option name="ALTERNATIVE_JRE_PATH" />
<option name="ENABLE_SWING_INSPECTOR" value="false" />
<option name="ENV_VARIABLES" />
<option name="PASS_PARENT_ENVS" value="true" />
<module name="" />
<envs />
<method />
</configuration>
<configuration default="true" type="FlashRunConfigurationType" factoryName="Flash App">
<option name="BCName" value="" />
<option name="IOSSimulatorSdkPath" value="" />
<option name="adlOptions" value="" />
<option name="airProgramParameters" value="" />
<option name="appDescriptorForEmulator" value="Android" />
<option name="debugTransport" value="USB" />
<option name="debuggerSdkRaw" value="BC SDK" />
<option name="emulator" value="NexusOne" />
<option name="emulatorAdlOptions" value="" />
<option name="fastPackaging" value="true" />
<option name="fullScreenHeight" value="0" />
<option name="fullScreenWidth" value="0" />
<option name="launchUrl" value="false" />
<option name="launcherParameters">
<LauncherParameters>
<option name="browser" value="a7bb68e0-33c0-4d6f-a81a-aac1fdb870c8" />
<option name="launcherType" value="OSDefault" />
<option name="newPlayerInstance" value="false" />
<option name="playerPath" value="/usr/bin/flashplayerdebugger" />
</LauncherParameters>
</option>
<option name="mobileRunTarget" value="Emulator" />
<option name="moduleName" value="" />
<option name="overriddenMainClass" value="" />
<option name="overriddenOutputFileName" value="" />
<option name="overrideMainClass" value="false" />
<option name="runTrusted" value="true" />
<option name="screenDpi" value="0" />
<option name="screenHeight" value="0" />
<option name="screenWidth" value="0" />
<option name="url" value="http://" />
<option name="usbDebugPort" value="7936" />
<method />
</configuration>
<configuration default="true" type="FlexUnitRunConfigurationType" factoryName="FlexUnit" appDescriptorForEmulator="Android" class_name="" emulatorAdlOptions="" method_name="" package_name="" scope="Class">
<option name="BCName" value="" />
<option name="launcherParameters">
<LauncherParameters>
<option name="browser" value="a7bb68e0-33c0-4d6f-a81a-aac1fdb870c8" />
<option name="launcherType" value="OSDefault" />
<option name="newPlayerInstance" value="false" />
<option name="playerPath" value="/usr/bin/flashplayerdebugger" />
</LauncherParameters>
</option>
<option name="moduleName" value="" />
<option name="trusted" value="true" />
<method />
</configuration>
<configuration default="true" type="GradleRunConfiguration" factoryName="Gradle">
<ExternalSystemSettings>
<option name="executionName" />
@@ -245,6 +431,15 @@
</ExternalSystemSettings>
<method />
</configuration>
<configuration default="true" type="GrailsRunConfigurationType" factoryName="Grails">
<setting name="vmparams" value="" />
<setting name="cmdLine" value="run-app" />
<setting name="passParentEnv" value="true" />
<setting name="launchBrowser" value="true" />
<setting name="launchBrowserUrl" value="" />
<setting name="depsClasspath" value="false" />
<method />
</configuration>
<configuration default="true" type="JUnit" factoryName="JUnit">
<extension name="coverage" enabled="false" merge="false" sample_coverage="true" runner="idea" />
<module name="" />
@@ -287,6 +482,15 @@
<envs />
<method />
</configuration>
<configuration default="true" type="JavaScriptTestRunnerProtractor" factoryName="Protractor">
<config-file value="" />
<node-interpreter value="project" />
<envs />
<method />
</configuration>
<configuration default="true" type="JavascriptDebugType" factoryName="JavaScript Debug">
<method />
</configuration>
<configuration default="true" type="JetRunConfigurationType" factoryName="Kotlin">
<extension name="coverage" enabled="false" merge="false" sample_coverage="true" runner="idea" />
<option name="MAIN_CLASS_NAME" />
@@ -350,6 +554,12 @@
<envs />
<method />
</configuration>
<configuration default="true" type="SpringBootApplicationConfigurationType" factoryName="Spring Boot">
<extension name="coverage" enabled="false" merge="false" sample_coverage="true" runner="idea" />
<module name="" />
<envs />
<method />
</configuration>
<configuration default="true" type="TestNG" factoryName="TestNG">
<extension name="coverage" enabled="false" merge="false" sample_coverage="true" runner="idea" />
<module name="" />
@@ -378,6 +588,28 @@
<listeners />
<method />
</configuration>
<configuration default="true" type="js.build_tools.gulp" factoryName="Gulp.js">
<node-interpreter>project</node-interpreter>
<node-options />
<gulpfile />
<tasks />
<arguments />
<envs />
<method />
</configuration>
<configuration default="true" type="js.build_tools.npm" factoryName="npm">
<command value="run" />
<scripts />
<node-interpreter value="project" />
<envs />
<method />
</configuration>
<configuration default="true" type="osgi.bnd.run" factoryName="Run Launcher">
<method />
</configuration>
<configuration default="true" type="osgi.bnd.run" factoryName="Test Launcher (JUnit)">
<method />
</configuration>
<configuration default="true" type="uTestRunConfiguration" factoryName="utest">
<extension name="coverage" enabled="false" merge="false" sample_coverage="true" runner="idea" />
<module name="" />
@@ -404,47 +636,76 @@
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="b41a9788-25b3-4e04-923f-17cde259631b" name="Default" comment="" />
<created>1481799944130</created>
<changelist id="74fa95ce-dfd4-40da-a7a1-b336badfaea8" name="Default" comment="" />
<created>1481830590764</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1481799944130</updated>
<updated>1481830590764</updated>
<workItem from="1481830593703" duration="700000" />
<workItem from="1481831304788" duration="5133000" />
<workItem from="1481837779668" duration="4463000" />
</task>
<servers />
</component>
<component name="TimeTrackingManager">
<option name="totallyTimeSpent" value="10296000" />
</component>
<component name="ToolWindowManager">
<<<<<<< HEAD
<frame x="65" y="24" width="1295" height="744" extended-state="6" />
=======
<frame x="77" y="122" width="1400" height="893" extended-state="0" />
>>>>>>> 23a3c3f3fde97ee499a7fbcbe16e1c28c3297e05
<editor active="true" />
<layout>
<window_info id="Palette" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" />
<window_info id="Nl-Palette" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Palette&#9;" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Image Layers" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Capture Analysis" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="true" content_ui="tabs" />
<window_info id="Maven Projects" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<<<<<<< HEAD
<window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
=======
>>>>>>> 23a3c3f3fde97ee499a7fbcbe16e1c28c3297e05
<window_info id="Properties" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
<window_info id="Capture Tool" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Designer" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<<<<<<< HEAD
<window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.24555984" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
=======
<window_info id="Inspection Results" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.32885087" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
<window_info id="Database" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
>>>>>>> 23a3c3f3fde97ee499a7fbcbe16e1c28c3297e05
<window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
<window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
<window_info id="UI Designer" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Theme Preview" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" />
<window_info id="Palette&#9;" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Image Layers" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Java Enterprise" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
<window_info id="Capture Analysis" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
<window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
<window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
<window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
<window_info id="Theme Preview" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<<<<<<< HEAD
<window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
=======
>>>>>>> 23a3c3f3fde97ee499a7fbcbe16e1c28c3297e05
<window_info id="Favorites" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="true" content_ui="tabs" />
<window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" />
<window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
<window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
<window_info id="Hierarchy" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="combo" />
<window_info id="Inspection" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="5" side_tool="false" content_ui="tabs" />
<window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
</layout>
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="processedProjectFiles" value="true" />
</component>
<component name="VcsContentAnnotationSettings">
<option name="myLimit" value="2678400000" />
</component>
@@ -455,6 +716,7 @@
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/src/main/scala/KMeans.scala">
<provider selected="true" editor-type-id="text-editor">
<<<<<<< HEAD
<state relative-caret-position="720">
<caret line="40" column="3" lean-forward="false" selection-start-line="40" selection-start-column="3" selection-end-line="40" selection-end-column="3" />
<folding>
@@ -485,24 +747,50 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
=======
<state relative-caret-position="918">
<caret line="51" column="5" lean-forward="true" selection-start-line="51" selection-start-column="5" selection-end-line="51" selection-end-column="5" />
>>>>>>> 23a3c3f3fde97ee499a7fbcbe16e1c28c3297e05
<folding>
<element signature="e#23#54#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="jar://$MAVEN_REPOSITORY$/org/apache/spark/spark-catalyst_2.10/1.6.0/spark-catalyst_2.10-1.6.0-sources.jar!/org/apache/spark/sql/Row.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="324">
<caret line="38" column="34" lean-forward="false" selection-start-line="38" selection-start-column="34" selection-end-line="38" selection-end-column="34" />
<folding>
<element signature="n#!!doc" expanded="false" />
<element signature="e#832#872#0" expanded="false" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/XMLParser.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="270">
<caret line="15" column="7" lean-forward="false" selection-start-line="15" selection-start-column="7" selection-end-line="15" selection-end-column="7" />
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding>
<element signature="e#23#59#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="jar://$MAVEN_REPOSITORY$/org/scala-lang/scala-library/2.10.5/scala-library-2.10.5.jar!/scala/collection/TraversableLike.class">
<entry file="file://$PROJECT_DIR$/src/main/scala/Main.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="10" column="3" lean-forward="true" selection-start-line="10" selection-start-column="3" selection-end-line="10" selection-end-column="3" />
<folding>
<element signature="e#22#58#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/KMeans.scala">
<provider selected="true" editor-type-id="text-editor">
<<<<<<< HEAD
<state relative-caret-position="168">
<caret line="15" column="6" lean-forward="false" selection-start-line="15" selection-start-column="6" selection-end-line="15" selection-end-column="6" />
</state>
@@ -513,11 +801,19 @@
<state relative-caret-position="245">
<caret line="27" column="27" lean-forward="true" selection-start-line="27" selection-start-column="27" selection-end-line="27" selection-end-column="27" />
<folding />
=======
<state relative-caret-position="360">
<caret line="20" column="31" lean-forward="true" selection-start-line="20" selection-start-column="31" selection-end-line="20" selection-end-column="31" />
<folding>
<element signature="e#23#54#0" expanded="true" />
</folding>
>>>>>>> 23a3c3f3fde97ee499a7fbcbe16e1c28c3297e05
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/KMeans.scala">
<provider selected="true" editor-type-id="text-editor">
<<<<<<< HEAD
<state relative-caret-position="396">
<caret line="40" column="3" lean-forward="true" selection-start-line="40" selection-start-column="3" selection-end-line="40" selection-end-column="3" />
<folding>
@@ -530,6 +826,10 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="612">
<caret line="52" column="3" lean-forward="true" selection-start-line="52" selection-start-column="3" selection-end-line="52" selection-end-column="3" />
=======
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
>>>>>>> 23a3c3f3fde97ee499a7fbcbe16e1c28c3297e05
<folding>
<element signature="e#23#59#0" expanded="true" />
</folding>
@@ -575,16 +875,124 @@
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/Main.scala">
<provider selected="true" editor-type-id="text-editor">
<<<<<<< HEAD
<state relative-caret-position="245">
<caret line="27" column="27" lean-forward="true" selection-start-line="27" selection-start-column="27" selection-end-line="27" selection-end-column="27" />
=======
<state relative-caret-position="108">
<caret line="13" column="7" lean-forward="false" selection-start-line="13" selection-start-column="7" selection-end-line="13" selection-end-column="7" />
<folding>
<element signature="e#22#58#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="jar://$MAVEN_REPOSITORY$/org/apache/spark/spark-sql_2.10/1.6.0/spark-sql_2.10-1.6.0.jar!/org/apache/spark/sql/DataFrame.class">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="36">
<caret line="2" column="6" lean-forward="false" selection-start-line="2" selection-start-column="6" selection-end-line="2" selection-end-column="6" />
</state>
</provider>
</entry>
<entry file="jar://$MAVEN_REPOSITORY$/org/apache/spark/spark-catalyst_2.10/1.6.0/spark-catalyst_2.10-1.6.0.jar!/org/apache/spark/sql/Row.class">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="18">
<caret line="1" column="6" lean-forward="false" selection-start-line="1" selection-start-column="6" selection-end-line="1" selection-end-column="6" />
</state>
</provider>
</entry>
<entry file="jar://$MAVEN_REPOSITORY$/org/apache/spark/spark-core_2.10/1.6.0/spark-core_2.10-1.6.0.jar!/org/apache/spark/rdd/PairRDDFunctions.class">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="162">
<caret line="9" column="6" lean-forward="false" selection-start-line="9" selection-start-column="6" selection-end-line="9" selection-end-column="6" />
</state>
</provider>
</entry>
<entry file="jar://$MAVEN_REPOSITORY$/org/apache/spark/spark-core_2.10/1.6.0/spark-core_2.10-1.6.0-sources.jar!/org/apache/spark/rdd/PairRDDFunctions.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-2196">
<caret line="57" column="19" lean-forward="false" selection-start-line="57" selection-start-column="19" selection-end-line="57" selection-end-column="19" />
<folding>
<element signature="n#!!doc" expanded="false" />
<element signature="e#832#858#0" expanded="false" />
</folding>
</state>
</provider>
</entry>
<entry file="jar://$MAVEN_REPOSITORY$/org/apache/spark/spark-catalyst_2.10/1.6.0/spark-catalyst_2.10-1.6.0-sources.jar!/org/apache/spark/sql/Row.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="324">
<caret line="38" column="34" lean-forward="false" selection-start-line="38" selection-start-column="34" selection-end-line="38" selection-end-column="34" />
<folding>
<element signature="n#!!doc" expanded="false" />
<element signature="e#832#872#0" expanded="false" />
</folding>
</state>
</provider>
</entry>
<entry file="jar://$MAVEN_REPOSITORY$/org/scala-lang/scala-library/2.10.5/scala-library-2.10.5.jar!/scala/package.class">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="262">
<caret line="26" column="7" lean-forward="false" selection-start-line="26" selection-start-column="7" selection-end-line="26" selection-end-column="7" />
</state>
</provider>
</entry>
<entry file="jar://$MAVEN_REPOSITORY$/org/scala-lang/scala-library/2.10.5/scala-library-2.10.5-sources.jar!/scala/package.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="468">
<caret line="50" column="9" lean-forward="false" selection-start-line="50" selection-start-column="9" selection-end-line="50" selection-end-column="9" />
</state>
</provider>
</entry>
<entry file="jar://$MAVEN_REPOSITORY$/org/apache/spark/spark-sql_2.10/1.6.0/spark-sql_2.10-1.6.0-sources.jar!/org/apache/spark/sql/DataFrame.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1260">
<caret line="112" column="3" lean-forward="false" selection-start-line="112" selection-start-column="3" selection-end-line="112" selection-end-column="3" />
>>>>>>> 23a3c3f3fde97ee499a7fbcbe16e1c28c3297e05
<folding />
</state>
</provider>
</entry>
<entry file="jar:///usr/lib/jvm/java-8-openjdk-amd64/jre/lib/rt.jar!/java/util/Date.class">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="262">
<caret line="434" column="16" lean-forward="false" selection-start-line="434" selection-start-column="16" selection-end-line="434" selection-end-column="16" />
<folding>
<element signature="e#15761#15762#0" expanded="true" />
<element signature="e#15802#15803#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/Main.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="419">
<caret line="39" column="28" lean-forward="true" selection-start-line="39" selection-start-column="28" selection-end-line="39" selection-end-column="28" />
<folding>
<element signature="e#22#58#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/XMLParser.scala">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="441">
<caret line="135" column="0" lean-forward="true" selection-start-line="135" selection-start-column="0" selection-end-line="135" selection-end-column="0" />
<folding>
<element signature="e#23#59#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/scala/KMeans.scala">
<provider selected="true" editor-type-id="text-editor">
<<<<<<< HEAD
<state relative-caret-position="396">
<caret line="40" column="3" lean-forward="false" selection-start-line="40" selection-start-column="3" selection-end-line="40" selection-end-column="3" />
=======
<state relative-caret-position="135">
<caret line="36" column="86" lean-forward="true" selection-start-line="36" selection-start-column="86" selection-end-line="36" selection-end-column="86" />
>>>>>>> 23a3c3f3fde97ee499a7fbcbe16e1c28c3297e05
<folding>
<element signature="e#23#54#0" expanded="true" />
</folding>
@@ -647,20 +1055,4 @@
</provider>
</entry>
</component>
<component name="masterDetails">
<states>
<state key="ProjectJDKs.UI">
<settings>
<last-edited>1.8 (1)</last-edited>
<splitter-proportions>
<option name="proportions">
<list>
<option value="0.2" />
</list>
</option>
</splitter-proportions>
</settings>
</state>
</states>
</component>
</project>
+30 -14
View File
@@ -2,6 +2,7 @@ package ClusterSOData
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import scala.collection.mutable.ArrayBuffer
object KMeans {
@@ -10,10 +11,11 @@ object KMeans {
*/
//Create a map to store each data row with its closest cluster index as key
def train(dataset : DataFrame) : RDD[(Int,ArrayBuffer[Float])] = {
def train(dataset : DataFrame) : RDD[(Int,List[Float])] = {
val rows = dataset.rdd
val rowsAsArray = dataset.map(row => List(row.getInt(0).toFloat, row.getInt(1).toFloat, row.getInt(2).toFloat) )
val K = 5 //number of intended clusters
val n = rows.count() //number of datapoints
//val n = rows.count() //number of datapoints
val m = 3 //number of features
//var centres = new ArrayBuffer[Row]
@@ -24,23 +26,24 @@ object KMeans {
for (a <- 0 until K) {
centres(a) = rows(r.ne
}*/
val centres = rows.takeSample(false, K, System.nanoTime().toInt)
val clusterMap :RDD[(Int,Row)]= rows.map(row => (assignCluster(row,centres,m,K),row))
//val centres = rowsAsArray.takeSample(false, K, System.nanoTime().toInt)
val centres : Array[List[Float]] = Array(List(0.0f, 0.0f, 0.0f), List(10.0f, 10.0f, 10.0f), List(20.0f, 20.0f, 20.0f))
val clusterMap :RDD[(Int,List[Float])]= rowsAsArray.map(row => (assignCluster(row,centres,m,K),row))
val newCentres = calculateNewCentres(clusterMap)
newCentres
}
def calculateNorm(datapoint : Row, centre : Row, m: Int): Double = {
def calculateNorm(datapoint : List[Float], centre : List[Float], m: Int): Double = {
var norm : Double = 0.0
for (a <- 0 to m) {
norm = norm + Math.pow(datapoint.getFloat(a) - centre.getFloat(a), 2.0)
for (a <- 0 until m) {
norm = norm + Math.pow(datapoint(a) - centre(a), 2.0)
}
norm = Math.pow(norm, 0.5)
norm
}
def assignCluster(row : Row, centres: Array[Row], m : Int, K :Int): Int = {
def assignCluster(row : List[Float], centres: Array[List[Float]], m : Int, K :Int): Int = {
var smallestNorm = 99999999999.0
var closestCentre = 0
for (centreNumber <- 0 until K) {
@@ -53,14 +56,17 @@ object KMeans {
closestCentre
}
def calculateNewCentres(clusterMap : RDD[(Int,Row)]): RDD[(Int,ArrayBuffer[Float])] = {
val data = clusterMap.map(x => (x._1, x._2.asInstanceOf[ArrayBuffer[Float]]))
val newCentres = data.reduceByKey((a, b) => averageRow(a, b))
def calculateNewCentres(clusterMap : RDD[(Int,List[Float])]): RDD[(Int,List[Float])] = {
//val data = clusterMap.map(x => (x._1, x._2.asInstanceOf[ArrayBuffer[Double]]))
val newCentres = clusterMap.reduceByKey((a, b) => averageRow(a, b))
//val singleCluster = clusterMap.filter(x => x._1 == 0)
//val singleClusterAsArray = singleCluster.reduce()
newCentres
}
/*for (a <- 0 until K) {
var cluster = clusterMap.filter{case (a,_) => a == 0}
var data = cluster.map((_,a) => a :Row)*/
@@ -73,12 +79,22 @@ object KMeans {
return features
}*/
def averageRow(a :ArrayBuffer[Float], b:ArrayBuffer[Float]) : ArrayBuffer[Float] = {
val newRow = new ArrayBuffer[Float]()
for (i <- 0 until a.length) {
/*def averageRow(a :ArrayBuffer[Float], b:ArrayBuffer[Float]) : ArrayBuffer[Float] = {
val newRow = Row.apply()
for (i <- a.indices) {
val avgI = (a(i) + b(i)) /2
newRow(i) = avgI
}
newRow
}*/
/*
 * Compute the element-wise mean of two feature vectors.
 *
 * a, b: feature vectors; they are expected to contain the same number of
 *       features. If the lengths differ, the result is truncated to the
 *       shorter of the two (zip semantics).
 * Returns a new List whose i-th element is (a(i) + b(i)) / 2.
 *
 * NOTE(review): this is used as the combiner in reduceByKey. Averaging two
 * rows at a time is not associative, so the reduced value is generally not
 * the arithmetic mean of all rows in a cluster and may depend on reduction
 * order - confirm whether a (sum, count) reduction is wanted instead.
 */
def averageRow(a:List[Float], b:List[Float]) : List[Float] = {
  // Pair the features up and average each pair. Writing into an empty
  // ArrayBuffer by index (means(i) = ...) throws IndexOutOfBoundsException,
  // and indexing a List by position is O(n) per lookup, so build the result
  // directly from the zipped pairs.
  a.zip(b).map { case (x, y) => (x + y) / 2.0f }
}
}
+11 -4
View File
@@ -22,17 +22,23 @@ object Main {
def main(args: Array[String]) {
// Retrieve data from StackOverflow dataset XMLs. Format into DataFrames
// for easy access to data elements.
val dataFrames = DataParser.ParseData()
val df = XMLParser.ParseData()
// get the users XML file
val users = dataFrames("users")
val users = df("users")
val centres = KMeans.train(users)
val centresArray = centres.collect()
val unwrap = centresArray.map(x => x._2)
unwrap.foreach(println)
/*val users = dataFrames("users")
/*val dataFrames = DataParser.ParseData()
// get the users XML file
val users = dataFrames("users")
users.persist()
// Show 20 entries from the user dataset
users.show()
@@ -42,9 +48,10 @@ object Main {
// create new dataframe with only the reputation of the users
users.select("CreationDate").show()
// Info on using DataFrames here: https://www.mapr.com/blog/using-apache-spark-dataframes-processing-tabular-data
*/
// Info on using DataFrames here: https://www.mapr.com/blog/using-apache-spark-dataframes-processing-tabular-data
}
}
+22 -21
View File
@@ -13,7 +13,7 @@ import org.apache.spark.sql.types._
* Format and parse XML data to datasets, ready for further processing using
* spark
*/
object DataParser {
object XMLParser {
/*
* Generate array of DataFrames from XML content
@@ -23,14 +23,18 @@ object DataParser {
// Define XML file locations and a string of attribute tags to retrieve
// from each xml element.
val xmlInfos = Array(
("badges", "../stackoverflow_dataset/badges.txt", "Id UserId Name Date", Array[DataType](IntegerType, IntegerType, StringType, DateType)),
("comments", "../stackoverflow_dataset/comments.txt", "Id PostId Score Text CreationDate UserId", Array[DataType](IntegerType, IntegerType, IntegerType, StringType, DateType, IntegerType)),
("posts", "../stackoverflow_dataset/posts.txt", "Id PostTypeId ParentID AcceptedAnswerId CreationDate Score ViewCount Body OwnerUserId LastEditorUserId LastEditorDisplayName LastEditDate LastActivityDate CommunityOwnedDate ClosedDate Title Tags AnswerCount CommentCount FavoriteCount", Array[DataType](IntegerType, IntegerType, IntegerType, IntegerType, DateType, IntegerType, IntegerType, StringType, IntegerType, IntegerType, StringType, DateType, DateType, DateType, DateType, StringType, StringType, IntegerType, IntegerType, IntegerType)),
("postHistory", "../stackoverflow_dataset/postHistory.txt","Id PostHistoryTypeId PostId RevisionGUID CreationDate UserId UserDisplayName Comment Text CloseReasonId", Array[DataType](IntegerType, IntegerType, IntegerType,IntegerType, DateType, IntegerType, StringType, StringType, StringType, IntegerType)),
("postLinks", "../stackoverflow_dataset/postLinks.txt", "Id CreationDate PostId RelatedPostId PostLinkTypeId", Array[DataType](IntegerType, DateType, IntegerType, IntegerType, IntegerType)),
("users", "../stackoverflow_dataset/users.txt", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes", Array[DataType](IntegerType, DateType, StringType, StringType, DateType, StringType, StringType, IntegerType, StringType, IntegerType, IntegerType, IntegerType)),
("votes", "../stackoverflow_dataset/votes.txt", "Id PostId VoteTypeId UserId CreationDate", Array[DataType](IntegerType, IntegerType, IntegerType, IntegerType, DateType))
)
/*
("badges", "/data/stackoverflow/Badges", "Id UserId Name Date", Array[DataType](IntegerType, IntegerType, StringType, DateType)),
("comments", "/data/stackoverflow/Comments", "Id PostId Score Text CreationDate UserId", Array[DataType](IntegerType, IntegerType, IntegerType, StringType, DateType, IntegerType)),
("posts", "data/stackoverflow/Posts", "Id PostTypeId ParentID AcceptedAnswerId CreationDate Score ViewCount Body OwnerUserId LastEditorUserId LastEditorDisplayName LastEditDate LastActivityDate CommunityOwnedDate ClosedDate Title Tags AnswerCount CommentCount FavoriteCount", Array[DataType](IntegerType, IntegerType, IntegerType, IntegerType, DateType, IntegerType, IntegerType, StringType, IntegerType, IntegerType, StringType, DateType, DateType, DateType, DateType, StringType, StringType, IntegerType, IntegerType, IntegerType)),
("postHistory", "/data/stackoverflow/PostHistory","Id PostHistoryTypeId PostId RevisionGUID CreationDate UserId UserDisplayName Comment Text CloseReasonId", Array[DataType](IntegerType, IntegerType, IntegerType,IntegerType, DateType, IntegerType, StringType, StringType, StringType, IntegerType)),
("postLinks", "data/stackoverflow/PostLinks", "Id CreationDate PostId RelatedPostId PostLinkTypeId", Array[DataType](IntegerType, DateType, IntegerType, IntegerType, IntegerType)),
*/
("users", "stackoverflow_dataset/users.txt", "Reputation CreationDate DisplayName EmailHash LastAccessDate WebsiteUrl Location Age AboutMe Views UpVotes DownVotes", Array[DataType](IntegerType, DateType, StringType, StringType, DateType, StringType, StringType, IntegerType, StringType, IntegerType, IntegerType, IntegerType))
/*
("votes", "/data/stackoverflow/Votes", "Id PostId VoteTypeId UserId CreationDate", Array[DataType](IntegerType, IntegerType, IntegerType, IntegerType, DateType))
*/
)
// Store each file's DataFrame in an array of DataFrames.
val parsedData = xmlInfos.map(x => (x._1, ParseXMLInfo((x._2, x._3, x._4)))).toMap
@@ -98,18 +102,12 @@ object DataParser {
private def ParsingFunc(line: String, schemaString: String, schemaType: Array[DataType]) : Row = {
// Parse line of XML using Scala's built in XML library
try {
val xmlLine = scala.xml.XML.loadString(line)
var schemaPairs = schemaString.split(" ") zip schemaType
// Create array of values with element for each attribute in schemaString
var lineData = schemaPairs.map { case (fieldName: String, dType: DataType) => castToDType(getXMLAttribute(xmlLine, fieldName), dType) }
val xmlLine = scala.xml.XML.loadString(line)
var schemaPairs = schemaString.split(" ") zip schemaType
// Create array of values with element for each attribute in schemaString
var lineData = schemaPairs.map { case (fieldName: String, dType: DataType) => castToDType(getXMLAttribute(xmlLine, fieldName), dType) }
return Row.fromSeq(lineData)
} catch {
case e:Exception=>
println(line)
throw new Exception("failed to load")
}
return Row.fromSeq(lineData)
}
/*
@@ -129,7 +127,10 @@ object DataParser {
case DateType =>
// If the string is a date, convert from date string to long.
var format = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS")
return format.parse(attribute).getTime()
var longTime = format.parse(attribute).getTime()
// Then convert long to int representing days since epoch
var longDays : Long = longTime / (1000*60*60*24)
return longDays.toInt
}
}