A heuristic measure for detecting undesired influence of lossy JP2 compression on OCR in the absence of ground truth
Created: 2012-02-06 12:27:23
Last updated: 2012-03-09 14:33:19
The workflow takes TIFF image instances as input, applies a list of JP2 compression parameter values, executes OCR using an open source OCR engine, evaluates the results, and creates a diagram visualising the results.
Dependencies on external tools for the tool service components:
Dependencies on external Java libraries of beanshells:
Preview
Run
Run this Workflow in the Taverna Workbench...
Workflow Components
Authors (1)
Titles (1)
Kakadu encode uncompressed TIF image files (tool service) |
Descriptions (0)
Dependencies (1)
Inputs (4)
Name |
Description |
rates |
Compression rates
|
inFilesAbsPaths |
List of absolute paths to book image files
|
tess_langmod |
Tesseract language module
|
tesscmd |
|
Processors (8)
Name |
Type |
Description |
kakadu_encode |
workflow |
|
createSessionID |
beanshell |
Script// Draw a random session id in the range [0, 10000000) and publish it on the sessionID port
Random generator = new Random();
sessionID = generator.nextInt(10000000); |
split_rates |
localworker |
Script// Split the 'string' input into a list of tokens, published on the 'split' port.
List split = new ArrayList();
if (!string.equals("")) {
    // Use the optional 'regex' input when it is connected; otherwise split on commas.
    String separator = (regex != void) ? regex : ",";
    // Tokens are appended in the order String.split returns them.
    split.addAll(Arrays.asList(string.split(separator)));
}
|
newline |
stringconstant |
Value\n |
AggregateResults |
localworker |
Script// Aggregate the per-file results for every rate index and write one line per rate to a data file.
// Inputs:  sessionID        - used to build the output path /tmp/<sessionID>/data
//          resultList       - list (one entry per processed file) of lists of Levenshtein distances (as strings)
//          fileSizeList     - same nesting, compressed file sizes (as strings, whitespace tolerated)
//          origFileSizeList - same nesting, original file sizes (as strings, whitespace tolerated)
// Outputs: datafile            - "file://" URL of the written data file
//          num_processed_files - size of the outer resultList
// Each output line is: <rate index> <mean distance> <std deviation> <percent size reduction>
String outputFile = "/tmp/"+sessionID+"/data";
int num = resultList.size();
// Open the file exactly once with an explicit encoding; the earlier extra FileWriter
// on the same path was never closed and leaked a file handle.
BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile), "utf-8"));
try {
    List sublist = (List) resultList.get(0);
    int slnum = sublist.size();
    for (int i = 0; i < slnum; i++) {
        int linesum = 0;
        // Byte totals are kept in long: summed image sizes can exceed the int range.
        long totSizeReducedPerRate = 0;
        long totSizeOrigPerRate = 0;
        // Parse each distance once and reuse it for both the mean and the deviation.
        int[] distances = new int[num];
        for (int j = 0; j < num; j++) {
            distances[j] = Integer.parseInt((String)((List)resultList.get(j)).get(i));
            linesum += distances[j];
            String fsvalStr = (String)((List)fileSizeList.get(j)).get(i);
            fsvalStr = fsvalStr.replaceAll("\\s+", "");
            totSizeReducedPerRate += Long.parseLong(fsvalStr);
            String origFsvalStr = (String)((List)origFileSizeList.get(j)).get(i);
            origFsvalStr = origFsvalStr.replaceAll("\\s+", "");
            totSizeOrigPerRate += Long.parseLong(origFsvalStr);
        }
        double bytesReduced = (double)totSizeOrigPerRate - (double)totSizeReducedPerRate;
        double percentReduced = (bytesReduced/(double)totSizeOrigPerRate)*100;
        /* mean of the levenshtein distance values over all processed files for rate index i */
        double lineAvg = (double)linesum / (double)num;
        double sqDiffSum = 0.0;
        for (int j = 0; j < num; j++) {
            sqDiffSum += Math.pow((((double)distances[j]) - lineAvg),2);
        }
        /* population standard deviation of the same values */
        double lineStdDev = Math.sqrt((double)sqDiffSum / (double)num);
        out.write(Integer.toString(i)+" "+Double.toString(lineAvg) + " " +Double.toString(lineStdDev) + " " +Double.toString(percentReduced) + "\n");
    }
    out.flush();
} finally {
    // Release the file handle even when parsing or writing fails.
    out.close();
}
File outFile = new File(outputFile);
num_processed_files = num;
if(outFile.exists()) {
    datafile = "file://"+outFile.getAbsolutePath();
} else {
    throw new FileNotFoundException();
}
|
ReadDataFile |
localworker |
Script// Read the text resource named by the 'fileurl' input into the 'filecontents' output.
// The optional 'encoding' input selects the character set; when it is not connected
// the platform default is used.

// Open a buffered reader on fileUrl: first as a local file path, and if no such
// file exists, as a URL.
BufferedReader getReader (String fileUrl, String encoding) throws IOException {
    InputStreamReader reader;
    try {
        if (encoding == null) {
            reader = new FileReader(fileUrl);
        } else {
            reader = new InputStreamReader(new FileInputStream(fileUrl),encoding);
        }
    }
    catch (FileNotFoundException e) {
        // try a real URL instead
        URL url = new URL(fileUrl);
        if (encoding == null) {
            reader = new InputStreamReader (url.openStream());
        } else {
            reader = new InputStreamReader (url.openStream(), encoding);
        }
    }
    return new BufferedReader(reader);
}
StringBuffer sb = new StringBuffer(4000);
if (encoding == void) {
    encoding = null;
}
BufferedReader in = getReader(fileurl, encoding);
String lineEnding = System.getProperty("line.separator");
try {
    String str;
    // Every line is re-terminated with the platform line separator.
    while ((str = in.readLine()) != null) {
        sb.append(str);
        sb.append(lineEnding);
    }
} finally {
    // Close the reader even if a read fails part-way through.
    in.close();
}
filecontents = sb.toString();
|
gnuplot |
externaltool |
|
ShowDiagram |
localworker |
Script// Fetch the resource at 'url' (resolved against the optional 'base' input when connected)
// and publish its raw bytes on the 'image' output.
if ((url == void) || (url == null)) {
    throw new RuntimeException("The url must be specified");
}
URL inputURL = null;
if (base != void) {
    inputURL = new URL(new URL(base), url);
} else {
    inputURL = new URL(url);
}
InputStream is = inputURL.openStream();
ByteArrayOutputStream os = new ByteArrayOutputStream();
try {
    byte[] buffer = new byte[2048];
    int bytesRead;
    // Copy until end of stream (read returns -1).
    while ((bytesRead = is.read(buffer)) != -1) {
        os.write(buffer, 0, bytesRead);
    }
} finally {
    // Close the input stream even if the transfer fails.
    is.close();
    os.close();
}
image = os.toByteArray();
|
Beanshells (3)
Name |
Description |
Inputs |
Outputs |
createSessionID |
|
|
sessionID
|
create_tmp_environment |
|
inFileAbsPath
rate
sessionID
|
tmpDirAbsPath
tmpFileTrunk
|
CalculateLevenshteinDistance |
|
text1
text2
|
levenshtein_distance
|
Outputs (3)
Name |
Description |
tess_result |
|
datafile |
|
diagram |
|
Datalinks (18)
Source |
Sink |
createSessionID:sessionID |
kakadu_encode:sessionID |
inFilesAbsPaths |
kakadu_encode:inFileAbsPath |
tess_langmod |
kakadu_encode:tess_langmod |
split_rates:split |
kakadu_encode:rate |
tesscmd |
kakadu_encode:tesscmd |
rates |
split_rates:string |
newline:value |
split_rates:regex |
kakadu_encode:levenshtein_distance |
AggregateResults:resultList |
createSessionID:sessionID |
AggregateResults:sessionID |
kakadu_encode:jp2_file_size |
AggregateResults:fileSizeList |
kakadu_encode:orig_file_size |
AggregateResults:origFileSizeList |
AggregateResults:datafile |
ReadDataFile:fileurl |
createSessionID:sessionID |
gnuplot:sessionID |
AggregateResults:num_processed_files |
gnuplot:num_processed_files |
gnuplot:STDOUT |
ShowDiagram:url |
kakadu_encode:tess_result |
tess_result |
ReadDataFile:filecontents |
datafile |
ShowDiagram:image |
diagram |
Coordinations (2)
Controller |
Target |
createSessionID |
kakadu_encode |
AggregateResults |
gnuplot |
Uploader
License
Version 2 (latest)
(of 2)
Credits (1)
(People/Groups)
Attributions (0)
(Workflows/Files)
None
Shared with Groups (1)
Featured In Packs (0)
None
Log in to add to one of your Packs
Attributed By (0)
(Workflows/Files)
None
Favourited By (1)
Statistics
Other workflows that use similar services
(0)
There are no workflows in myExperiment that use similar services to this Workflow.
Comments (0)
No comments yet
Log in to make a comment