ARC2WARC_Hadoop_Jobhdfs_input_path00 HDFS input directory 2014-03-06 09:51:19.728 UTC hdfs:///user/output/directory 2014-03-06 09:51:45.877 UTC hdfs_output_path00 hdfs:///user/input/directory 2014-03-06 09:51:38.197 UTC HDFS input directory 2014-03-06 09:51:22.206 UTC hadoop_fs_ls_STDOUTarc2warc_hdp_STDOUTarc2warc_hdphadoop_job_jar_path0hdfs_input_path0hdfs_output_path0STDOUT00net.sf.taverna.t2.activitiesexternal-tool-activity1.4net.sf.taverna.t2.activities.externaltool.ExternalToolActivity default D0A4CDEB-DD10-4A8E-A49C-8871003083D8 cluster <?xml version="1.0" encoding="UTF-8"?> <sshInvocation><sshNode><host>fue-hdc01</host><port>22</port><directory>/tmp/</directory><linkCommand>/bin/ln -s %%PATH_TO_ORIGINAL%% %%TARGET_NAME%%</linkCommand><copyCommand>/bin/cp %%PATH_TO_ORIGINAL%% %%TARGET_NAME%%</copyCommand></sshNode></sshInvocation> 272a19a9-2322-44bf-af5c-afd14a24dbef # usage: hadoop jar # target/arc2warc-migration-1.0-SNAPSHOT-jar-with-dependencies.jar # [-a <arg>] [-c] [-h] [-i <arg>] [-l] [-o <arg>] [-p] [-t] [-x # <arg>] # -a,--arc2hwar <arg> ARC to HWAR mapping file path. [optional]. # -c,--comprwarc Create compressed WARC file. [optional]. # -h,--help print this message [optional]. # -i,--input <arg> HDFS Input directory with ARC files. [required]. # -l,--local Use local file system instead of HDFS (debugging). # [optional]. # -o,--output <arg> HDFS Output directory where the WARC files will be # stored. [required]. # -p,--payloadid Do payload mime type identification. [optional]. # -t,--localtest Starting application as a local java application # without hadoop (testing). [optional]. # -x,--iregex <arg> Only input paths matching the regular expression # will be processed. [optional]. hadoop jar %%hadoop_job_jar_path%% -i %%hdfs_input_path%% -o %%hdfs_output_path%% 1200 1800 hadoop_job_jar_path hdfs_input_path hdfs_output_path hdfs_output_path hdfs_output_path false false false UTF-8 false false false hdfs_input_path hdfs_input_path false false false UTF-8 false false false hadoop_job_jar_path hadoop_job_jar_path false false false UTF-8 false false false false true true 0 false net.sf.taverna.t2.coreworkflowmodel-impl1.4net.sf.taverna.t2.workflowmodel.processor.dispatch.layers.Parallelize 1 net.sf.taverna.t2.coreworkflowmodel-impl1.4net.sf.taverna.t2.workflowmodel.processor.dispatch.layers.ErrorBouncenet.sf.taverna.t2.coreworkflowmodel-impl1.4net.sf.taverna.t2.workflowmodel.processor.dispatch.layers.Failovernet.sf.taverna.t2.coreworkflowmodel-impl1.4net.sf.taverna.t2.workflowmodel.processor.dispatch.layers.Retry 1.0 1000 5000 0 net.sf.taverna.t2.coreworkflowmodel-impl1.4net.sf.taverna.t2.workflowmodel.processor.dispatch.layers.Invokehadoop_job_jar_pathvalue00net.sf.taverna.t2.activitiesstringconstant-activity1.4net.sf.taverna.t2.activities.stringconstant.StringConstantActivity /home/onbfue/arc2warc-migration-hdp-1.0-jar-with-dependencies.jar net.sf.taverna.t2.coreworkflowmodel-impl1.4net.sf.taverna.t2.workflowmodel.processor.dispatch.layers.Parallelize 1 net.sf.taverna.t2.coreworkflowmodel-impl1.4net.sf.taverna.t2.workflowmodel.processor.dispatch.layers.ErrorBouncenet.sf.taverna.t2.coreworkflowmodel-impl1.4net.sf.taverna.t2.workflowmodel.processor.dispatch.layers.Failovernet.sf.taverna.t2.coreworkflowmodel-impl1.4net.sf.taverna.t2.workflowmodel.processor.dispatch.layers.Retry 1.0 1000 5000 0 net.sf.taverna.t2.coreworkflowmodel-impl1.4net.sf.taverna.t2.workflowmodel.processor.dispatch.layers.Invokehadoop_fs_lshdfs_output_path0STDOUT00net.sf.taverna.t2.activitiesexternal-tool-activity1.4net.sf.taverna.t2.activities.externaltool.ExternalToolActivity 789663B8-DA91-428A-9F7D-B3F3DA185FD4 default local <?xml version="1.0" encoding="UTF-8"?> <localInvocation><shellPrefix>/bin/sh -c</shellPrefix><linkCommand>/bin/ln -s %%PATH_TO_ORIGINAL%% %%TARGET_NAME%%</linkCommand></localInvocation> 72850451-c560-4cb8-843f-08e7eea633d7 hadoop fs -ls %%hdfs_output_path%% 1200 1800 hdfs_output_path hdfs_output_path hdfs_output_path false false false UTF-8 false false false false true true 0 false net.sf.taverna.t2.coreworkflowmodel-impl1.4net.sf.taverna.t2.workflowmodel.processor.dispatch.layers.Parallelize 1 net.sf.taverna.t2.coreworkflowmodel-impl1.4net.sf.taverna.t2.workflowmodel.processor.dispatch.layers.ErrorBouncenet.sf.taverna.t2.coreworkflowmodel-impl1.4net.sf.taverna.t2.workflowmodel.processor.dispatch.layers.Failovernet.sf.taverna.t2.coreworkflowmodel-impl1.4net.sf.taverna.t2.workflowmodel.processor.dispatch.layers.Retry 1.0 1000 5000 0 net.sf.taverna.t2.coreworkflowmodel-impl1.4net.sf.taverna.t2.workflowmodel.processor.dispatch.layers.Invokearc2warc_hdphadoop_job_jar_pathhadoop_job_jar_pathvaluearc2warc_hdphdfs_input_pathhdfs_input_patharc2warc_hdphdfs_output_pathhdfs_output_pathhadoop_fs_lshdfs_output_pathhdfs_output_pathhadoop_fs_ls_STDOUThadoop_fs_lsSTDOUTarc2warc_hdp_STDOUTarc2warc_hdpSTDOUT ARC2WARC Hadoop Job 2014-03-06 09:53:06.302 UTC c65e2854-5451-4d20-8e80-67587b45687a 2014-03-06 09:54:02.565 UTC b7fbec18-07cc-4451-8b40-0c3c1ed9288e 2014-03-06 09:57:15.411 UTC Just a wrapper workflow for a Hadoop job converting ARC to WARC files. 2014-03-06 09:53:29.317 UTC Sven Schlarb 2014-03-06 09:53:09.245 UTC