This workflow generates ePrints XML import files with data set metadata for the FLOSSmole project. It reads in an input file generated from a Notre Dame SourceForge dump SQL query and uses regular expressions to parse the filename for the data set's source repository, download URL, and basic description. It also translates the epoch date into an SQL-style date format suitable for import, and converts the file size from bytes into larger units, e.g. GB or MB. These data are inserted into an XML eprint record template (specific to the FLOSSmole ePrints repository configuration at wp.floss.syr.edu) and the individual eprints are aggregated into an XML import file.
Shim to read in the file, location provided by a string constant.
net.sourceforge.taverna.scuflworkers.io.TextFileReader
Edit to use your local path to the input file location.
/inputfilelocation/inputfile.txt
Takes a flat CSV input file and splits it into a list.
\n
org.embl.ebi.escience.scuflworkers.java.SplitByRegex
Takes the list input and creates a 2-deep list.
,
org.embl.ebi.escience.scuflworkers.java.SplitByRegex
Creates a general description of the data set contents based on regex matching on filenames.
import java.util.regex.Pattern;

// Derives a one-sentence description of a data set from its file name.
//   Input:  filename    - name of the data set file.
//   Output: description - text of the first rule below whose pattern matches
//                         the entire file name, or "" when no rule matches.
// The rules are tried from top to bottom and the first hit wins, so their
// order is significant and must not be rearranged casually.

// NOTE(review): this copyright pattern is declared but no rule tests it;
// confirm whether a copyright description was intended.
String copy_regex = ".*opyright.*";

if (Pattern.matches(".*datamart.*", filename)) {
    description = "Datamart package of all data for the project made in this release.";
} else if (Pattern.matches("ossmole.*", filename)) {
    description = "Data used to generate the FLOSSmole data products.";
} else if (Pattern.matches(".*[D|d][b|B][e|E]nv.*", filename)) {
    description = "Database environment information.";
} else if (Pattern.matches(".*icense.*", filename)) {
    description = "Project license information.";
} else if (Pattern.matches(".*uthor.*", filename)) {
    description = "Project authorship data.";
} else if (Pattern.matches(".*[d|D]esc.*", filename)) {
    description = "Project descriptive data.";
} else if (Pattern.matches(".*[s|S]tats.*", filename)) {
    description = "Project statistics.";
} else if (Pattern.matches(".*[t|T]rove.*", filename)) {
    description = "Classification of topics.";
} else if (Pattern.matches(".*[u|U][r|R][L|l].*", filename)) {
    description = "Project URLs.";
} else if (Pattern.matches(".*[L|l]ist.*", filename)) {
    description = "Project list for the source repository.";
} else if (Pattern.matches(".*[d|D]ate.*", filename)) {
    description = "Date projects were founded.";
} else if (Pattern.matches(".*[d|D]onor.*", filename)) {
    description = "Donor information for projects.";
} else if (Pattern.matches(".*[d|D]ownload.*", filename)) {
    description = "Download statistics.";
} else if (Pattern.matches(".*rog[L|l]ang.*|.*anguage.*", filename)) {
    description = "Project programming language.";
} else if (Pattern.matches(".*[r|R]ank.*", filename)) {
    description = "Project ranking within the source repository.";
} else if (Pattern.matches(".*[f|F]orum.*", filename)) {
    description = "Project forum data.";
} else if (Pattern.matches(".*racker.*", filename)) {
    description = "Project tracker data.";
} else if (Pattern.matches(".*[n|N]ame.*", filename)) {
    description = "Project names.";
} else if (Pattern.matches(".*[i|I]nt[a|A]ud.*", filename)) {
    description = "Intended audience for the product.";
} else if (Pattern.matches(".*nfo.*", filename)) {
    description = "Project information.";
} else if (Pattern.matches(".*[t|T]opic.*", filename)) {
    description = "Project topic information.";
} else if (Pattern.matches(".*[S|s]tatus.*", filename)) {
    description = "Project status data.";
} else if (Pattern.matches(".*[o|O]p[s|S]ys.*", filename)) {
    description = "Operating system information.";
} else if (Pattern.matches(".*[e|E]nviro.*", filename)) {
    description = "Project development environment.";
} else if (Pattern.matches(".*evelopers.*|.*[D|d]eveloper[d|D]ata.*", filename)) {
    description = "Project developers.";
} else if (Pattern.matches(".*[D|d]eveloper[P|p]roject.*|.*[d|D]ev[p|P]roject.*|.*[d|D]eveloper_[p|P]roj.*", filename)) {
    description = "Developers and projects association data.";
} else if (Pattern.matches(".*[n|N]at[l|L]ang.*", filename)) {
    description = "Project native development language.";
} else if (Pattern.matches(".*[i|I]nterface.*|.*[u|U]ser[I|i]nt.*", filename)) {
    description = "User interface information.";
} else {
    // No rule recognised the file name: leave the description empty.
    description = "";
}
filename
description
Extracts the repository data source from each filename.
import java.util.regex.Pattern;

// Maps a data set file name to the repository it was collected from.
//   Input:  filename - name of the data set file.
//   Output: source   - repository label chosen by the first matching prefix
//                      rule below, or "other" when none matches.
// Rules are tried from top to bottom; plain prefixes are checked before the
// "datamart_" prefixed variants.

if (Pattern.matches("^ow.*", filename)) {
    source = "ObjectWeb";
} else if (Pattern.matches("^sf.*", filename)) {
    source = "SourceForge";
} else if (Pattern.matches("^project.*", filename)) {
    source = "FLOSSmole";
} else if (Pattern.matches("^fm.*", filename)) {
    source = "freshmeat";
} else if (Pattern.matches("^rf.*", filename)) {
    source = "RubyForge";
} else if (Pattern.matches("^fsf.*", filename)) {
    source = "Free Software Foundation";
} else if (Pattern.matches("^deb.*", filename)) {
    source = "Debian";
} else if (Pattern.matches("^sk.*", filename)) {
    source = "Source Kibitzer";
} else if (Pattern.matches("^ossmole.*", filename)) {
    source = "FLOSSmole";
} else if (Pattern.matches("^datamart_ow.*", filename)) {
    source = "ObjectWeb";
} else if (Pattern.matches("^datamart_sf.*", filename)) {
    source = "SourceForge";
} else if (Pattern.matches("^datamart_fm.*", filename)) {
    source = "freshmeat";
} else if (Pattern.matches("^datamart_rf.*", filename)) {
    source = "RubyForge";
} else if (Pattern.matches("^datamart_fsf.*", filename)) {
    source = "Free Software Foundation";
} else if (Pattern.matches("^datamart_deb.*", filename)) {
    source = "Debian";
} else {
    // Unrecognised prefix.
    source = "other";
}
filename
source
Aggregates the individual eprint records into a depositable XML file, configured specifically for the wp.floss.syr.edu ePrints repository.
// Wraps the individual eprint records in a single <eprints> root element.
//   Input:  eprint      - list of eprint record strings, one per data set.
//   Output: import_file - "<eprints>", then each record on its own line, then
//                         "</eprints>" (no trailing newline).
// A StringBuffer is used instead of repeated String "+" so the accumulated
// text is not re-copied for every record in the list.
StringBuffer xml = new StringBuffer("<eprints>\n");
for (int i = 0; i < eprint.size(); i++) {
    // "" + element keeps the same string conversion the "+" concatenation used.
    xml.append("" + eprint.get(i));
    xml.append("\n");
}
xml.append("</eprints>");
import_file = xml.toString();
eprint
import_file
Changes the date format from epoch seconds to an SQL-style date (yyyy-MM-dd).
import java.text.SimpleDateFormat;

// Converts an epoch timestamp in seconds into a yyyy-MM-dd date string.
//   Input:  post_date   - decimal text holding seconds since the epoch.
//   Output: date_posted - the same instant formatted as yyyy-MM-dd.
// Throws NumberFormatException when post_date is not a decimal integer.
// NOTE(review): SimpleDateFormat formats in the JVM's default time zone, so
// the calendar date can differ between machines for timestamps near midnight.
SimpleDateFormat sqlFormat = new SimpleDateFormat("yyyy-MM-dd");

// The value comes out of a text split, so strip stray whitespace (for example
// a trailing carriage return) that Long.parseLong would otherwise reject.
long epochSecs = Long.parseLong(post_date.trim());

// Date(long) expects milliseconds, hence the conversion from seconds.
Date posted = new Date(epochSecs * 1000L);
date_posted = sqlFormat.format(posted);
post_date
date_posted
Formats the filesize in bytes into a more human-readable format, conditionally displaying results in GB, MB, KB, or B.
// Renders a byte count in the largest decimal unit that is at least 1.
//   Input:  filesize           - decimal text holding the size in bytes.
//   Output: formatted_filesize - "<n> GB", "<n> MB", "<n> KB" or "<n> B".
// Units are powers of 1000 and the value is truncated by integer division,
// e.g. 1999 bytes is reported as "1 KB".
// Throws NumberFormatException when filesize is not a decimal integer.

// Parsed as long: an int cannot hold sizes above 2,147,483,647 bytes, which
// are exactly the files the GB branch exists for. trim() tolerates stray
// whitespace left over from the text split.
long bytes = Long.parseLong(filesize.trim());
long kbytes = bytes / 1000;
long mbytes = kbytes / 1000;
long gbytes = mbytes / 1000;

if (gbytes >= 1) {
    formatted_filesize = gbytes + " GB";
} else if (mbytes >= 1) {
    formatted_filesize = mbytes + " MB";
} else if (kbytes >= 1) {
    formatted_filesize = kbytes + " KB";
} else {
    formatted_filesize = bytes + " B";
}
filesize
formatted_filesize
Constructs the SourceForge file download URL for FLOSSmole data sets, given the name of the files.
// Builds the download link by appending the file name to the fixed
// SourceForge download directory used for the "ossmole" project.
//   Input:  filename - name of the data set file.
//   Output: url      - download directory followed by filename.
String download_root = "http://downloads.sourceforge.net/ossmole/";
url = download_root + filename;
filename
url
Uses pattern matching to identify file types in file names.
import java.util.regex.Pattern;

// Classifies a file by the characters its name ends with.
//   Input:  filename - name of the data set file.
//   Output: filetype - ".bz2", ".gz" or ".txt"; "other" when none applies.
// The patterns only require the name to END in "bz2", "gz" or "txt"; they do
// not insist on a preceding dot.
if (Pattern.matches(".*bz2$", filename)) {
    filetype = ".bz2";
} else if (Pattern.matches(".*gz$", filename)) {
    filetype = ".gz";
} else if (Pattern.matches(".*txt$", filename)) {
    filetype = ".txt";
} else {
    filetype = "other";
}
filename
filetype
Reads the 2-deep input list and splits out the values into separate variables. Not all fields were used.
// Pulls the fields this workflow needs out of one parsed input row.
//   Input:   file      - list of column values for a single row.
//   Outputs: file_name - column 0, post_date - column 3, filesize - column 2.
// Column 1 is present in the row but is not used here.
int nameColumn = 0;
int dateColumn = 3;
int sizeColumn = 2;

file_name = file.get(nameColumn);
post_date = file.get(dateColumn);
filesize = file.get(sizeColumn);
file
file_name
post_date
filesize
Generates an XML ePrint record based on a template specifically configured for data set deposit in the wp.floss.syr.edu ePrints repository.
// Fills the fixed eprint XML template with the metadata of one data set.
//   Inputs: description, filename, url, filetype, filesize, source, post_date.
//   Output: eprint_record - a single-line <eprint>...</eprint> element.
// Every inserted value is XML-escaped so that a '&', '<' or '>' in a file name
// or URL cannot produce a malformed import file. Values without those
// characters are inserted exactly as before.
// NOTE(review): assumes the inputs arrive as plain text, not as text that is
// already XML-escaped; confirm against the upstream processors.

// Converts a value to text and escapes the characters that are
// markup-significant inside XML element content.
String xmlText(Object value) {
    String text = "" + value;
    // '&' must be handled first so the entities added below are not re-escaped.
    text = text.replaceAll("&", "&amp;");
    text = text.replaceAll("<", "&lt;");
    text = text.replaceAll(">", "&gt;");
    return text;
}

eprint_record = "<eprint><userid>232</userid><type>dataset</type><data_type>archive/secondary</data_type>"
    + "<abstract>" + xmlText(description) + "</abstract>"
    + "<creators>"
    + "<item><name><given>Megan</given><family>Squire</family><lineage/><honourific/></name><id>megan@elon.edu</id></item>"
    + "<item><name><given>Kevin</given><family>Crowston</family><lineage/><honourific/></name><id>crowston@syr.edu</id></item>"
    + "<item><name><given>James</given><family>Howison</family><lineage/><honourific/></name><id>jhowison@syr.edu</id></item>"
    + "</creators>"
    + "<producers><item>FLOSSmole</item></producers>"
    + "<title>" + xmlText(filename) + "</title>"
    + "<ispublished>dep</ispublished><output_media>download</output_media><publisher>FLOSSmole</publisher>"
    + "<official_url>" + xmlText(url) + "</official_url>"
    + "<funders><item>US NSF Grant 07-08437</item></funders>"
    + "<data_file_format>" + xmlText(filetype) + "</data_file_format>"
    + "<data_file_size>" + xmlText(filesize) + "</data_file_size>"
    + "<dataset_format>CSV</dataset_format>"
    + "<source_repository>" + xmlText(source) + "</source_repository>"
    + "<note>FLOSS research data mined from source repositories for the FLOSSmole project. Data are freely available for use; citation requested.</note>"
    + "<date>" + xmlText(post_date) + "</date>"
    + "<date_type>deposited</date_type><subjects><item>flossmole</item></subjects></eprint>";
filename
source
filesize
post_date
filetype
url
description
eprint_record
Text output of the XML import file containing the ePrints metadata records.