This workflow uses one or more services that
are deprecated as of 31st December 2012
(about 12 years ago), and may no longer function.
Show details...
Affected service WSDL:
- http://soap.genome.jp/KEGG.wsdl
Details:
KEGG will be moving from a WSDL/SOAP interface to REST. Details of the new REST services can be found here.
Working examples that use the new REST service can be viewed here, here and here.
FINAL VERSION
Created: 2012-05-04 12:42:03
Last updated: 2012-05-04 13:00:37
Preview
Run
Run this Workflow in the Taverna Workbench...
Workflow Components
Authors (0)
Titles (0)
Descriptions (0)
Dependencies (0)
Processors (14)
Name |
Type |
Description |
transeq |
soaplab |
Endpointhttp://www.ebi.ac.uk/soaplab/services/nucleic_translation.transeq |
email |
stringconstant |
Valueg.georgiou@newcastle.ac.uk |
BLAST |
workflow |
|
INTERPRO |
workflow |
|
makeGenBankFormat |
localworker |
Script/*
 * makeGenBankFormat (BeanShell)
 *
 * Inputs : uniProtIDs - list of UniProt identifiers (one per gene, from the BLAST step)
 *          seqin      - list of transeq results, parallel to uniProtIDs
 * Output : contents   - the GenBank-style text that is also written to outputPath
 *
 * For every identifier the script downloads the UniProt flat-file record, pulls out the
 * entry name (ID), gene name (GN), the first FUNCTION comment (CC) and the GO
 * cross-references (DR), and emits one gene/CDS feature per identifier.
 */
import java.io.*;
import java.net.*;
import java.util.*;

// Where the GenBank text is saved on disk. Change this to your own path.
String outputPath = "/home/george/Desktop/EScience/GENBANKFILE/genbank.txt";

// The workflow's datalinks feed transeq:outseq into the "seqin" port, while this script
// historically read the list as "aaSequences". Accept whichever name is actually bound.
List translatedSeqs;
if (aaSequences != void) {
    translatedSeqs = aaSequences;
} else {
    translatedSeqs = seqin;
}

// Header of the GenBank file (fields are left empty, only the feature table is filled in).
StringBuilder genbank = new StringBuilder(
        "LOCUS\nDEFINITION\n\nACCESSION\nVERSION\nKEYWORDS\nSOURCE\n ORGANISM\n\n\nREFERENCE\n AUTHORS\n TITLE\n\n JOURNAL\n\n PUBMED\n\n\nREFERENCE\n AUTHORS\n TITLE\n\nJOURNAL\n\n PUBMED\n\n\nREFERENCE\n AUTHORS\n TITLE\n\n FEATURES\n");

// One gene/CDS feature per UniProt identifier.
for (int i = 0; i < uniProtIDs.size(); i++) {
    // The i-th transeq entry, split on runs of spaces:
    // [1] start position, [2] end position, [3] frame number, [4] translated sequence field.
    // NOTE(review): the exact layout of this entry comes from the upstream tools - confirm.
    String[] seqFields = ((String) translatedSeqs.get(i)).split("[ ]+");
    String inStartPos = seqFields[1];
    String inEndPos = seqFields[2];
    String inFrameNumber = seqFields[3];
    String inSequence = seqFields[4];

    String geneID = "";
    String geneName = "";
    StringBuilder uniprotFunction = new StringBuilder();
    // 0 = FUNCTION comment not seen yet, 1 = currently inside it, 2 = finished.
    int functionState = 0;
    List gos = new ArrayList();

    URL inputURL = new URL("http://www.uniprot.org/uniprot/"
            + uniProtIDs.get(i) + ".txt");
    URLConnection con = inputURL.openConnection();
    // getContentEncoding() reports the Content-Encoding header (e.g. gzip), not a charset,
    // so it must not be handed to InputStreamReader. Decode the record as UTF-8 instead.
    BufferedReader reader = new BufferedReader(
            new InputStreamReader(con.getInputStream(), "UTF-8"));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            // Split each line at every run of spaces and semicolons.
            String[] tokens = line.split("[ ;]+");
            if (tokens.length < 2) {
                // Terminator ("//") and blank lines carry nothing we need.
                continue;
            }
            String lineCode = tokens[0];

            if (lineCode.equals("ID")) {
                geneID = tokens[1];
            } else if (lineCode.equals("GN")) {
                // Keep the value of the first "key=value" pair; GN lines without one
                // (e.g. wrapped synonym lists) are skipped instead of crashing.
                String[] keyValue = tokens[1].split("=");
                if (keyValue.length > 1) {
                    geneName = keyValue[1];
                }
            } else if (lineCode.equals("CC")) {
                if (tokens[1].equals("-!-")) {
                    // "CC -!- TOPIC:" starts a new comment topic.
                    if (functionState == 0 && tokens.length > 2
                            && tokens[2].equals("FUNCTION:")) {
                        functionState = 1;
                        for (int j = 3; j < tokens.length; j++) {
                            uniprotFunction.append(tokens[j]).append(" ");
                        }
                    } else if (functionState == 1) {
                        // Any following topic ends the FUNCTION text.
                        functionState = 2;
                    }
                } else if (functionState == 1) {
                    // Continuation line of the FUNCTION text.
                    for (int j = 1; j < tokens.length; j++) {
                        if (tokens[j].contains("--")) {
                            // Dashed separator line: the comment section is over.
                            functionState = 2;
                            break;
                        }
                        uniprotFunction.append(tokens[j]).append(" ");
                    }
                }
            } else if (lineCode.equals("DR") && tokens[1].equals("GO")) {
                // Keep the GO id and term, dropping everything from an IEA evidence
                // token onwards.
                StringBuilder geneOntology = new StringBuilder();
                for (int j = 2; j < tokens.length; j++) {
                    if (tokens[j].contains("IEA")) {
                        break;
                    }
                    geneOntology.append(tokens[j]).append(" ");
                }
                if (geneOntology.length() > 0) {
                    gos.add(geneOntology.toString());
                }
            }
        }
    } finally {
        // Close exactly once, after the whole record has been read (or on failure).
        reader.close();
    }

    // Emit the feature once per gene, after the complete record has been parsed.
    // Locations use the GenBank "start..end" form and every quoted qualifier is closed.
    genbank.append("\tgene\t\t");
    genbank.append(inStartPos).append("..").append(inEndPos).append("\n");
    genbank.append("\t\t\t/gene=\"").append(geneName).append("\"\n");
    genbank.append("\tCDS\t\t");
    genbank.append(inStartPos).append("..").append(inEndPos).append("\n");
    genbank.append("\t\t\t/gene=\"").append(geneName).append("\"\n");
    genbank.append("\t\t\t/function=\"").append(uniprotFunction.toString()).append("\"\n");
    genbank.append("\t\t\t/protein_id=\"").append(geneID).append("\"\n");
    for (int k = 0; k < gos.size(); k++) {
        genbank.append("\t\t\t/db_xref=\"").append(gos.get(k)).append("\"\n");
    }
    genbank.append("\t\t\t/codon_start=").append(inFrameNumber).append("\n");
    // The sequence field is split on '=', ' ' and '.'; the third piece is emitted.
    // NOTE(review): depends on the upstream sequence-field layout - confirm.
    String[] sequenceParts = inSequence.split("[= .]+");
    genbank.append("\t\t\t/translation=\"").append(sequenceParts[2]).append("\"\n");
}
genbank.append("\n\nORIGIN");

// Publish the result on the output port first, then save a copy to disk.
String genbankfile = genbank.toString();
contents = genbankfile;
BufferedWriter out = new BufferedWriter(new FileWriter(outputPath));
try {
    out.write(genbankfile);
} finally {
    out.close();
} |
glimmerSplit |
externaltool |
|
SendToTranseq |
beanshell |
Script/*
 * SendToTranseq (BeanShell)
 *
 * Reads the multi-FASTA file produced by the glimmer splitter and stores each FASTA
 * record (header line plus its sequence lines, each terminated by "\n") as one element
 * of the output list "out".
 */
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;

// The multi-FASTA file is read from this location; define your own path here.
String multiFastaPath = "/home/george/Desktop/Scripts/ordered_gi|49482253|ref|NC_002952.2|_g3.tfa";

List fastaSeqList = new ArrayList();
StringBuilder record = new StringBuilder();
// True once at least one sequence line has been read, so that the very first header
// does not flush an empty record.
boolean guard = false;

// A missing file now fails here with FileNotFoundException instead of being printed
// and followed by a NullPointerException on the first read.
Scanner sc = new Scanner(new File(multiFastaPath));
try {
    while (sc.hasNextLine()) {
        String output = sc.nextLine();
        if (output.length() == 0) {
            // Blank lines belong to no record; skipping them also avoids charAt(0) failing.
            continue;
        }
        if (output.charAt(0) != '>') {
            // Sequence line: extend the current record.
            record.append(output).append("\n");
            guard = true;
        } else {
            // Header line: the previous record is complete, store it and start a new one.
            if (guard) {
                fastaSeqList.add(record.toString());
                record.setLength(0);
            }
            record.append(output).append("\n");
        }
    }
} finally {
    sc.close();
}
// Store the last record, unless the file held no content at all.
if (record.length() > 0) {
    fastaSeqList.add(record.toString());
}
out = fastaSeqList; |
glimmer |
externaltool |
|
build_icm |
externaltool |
|
extract |
externaltool |
|
longorf |
externaltool |
|
UNIPROTTOKEGG |
workflow |
|
KEGG |
workflow |
|
LocalBlast |
workflow |
|
Beanshells (4)
Name |
Description |
Inputs |
Outputs |
SendToTranseq |
|
|
out
|
makelist |
|
list
|
out
|
remove_Nulls |
|
input
|
output
|
remove_nulls_2 |
|
input
|
output
|
Outputs (16)
Name |
Description |
BLAST_IDs |
|
BLAST_STATUS |
|
BLAST_GRAPHICAL |
|
BLAST_OUTPUT |
|
INTERPRO_XML |
|
INTERPRO_STATUS |
|
INTERPRO_GRAPHICAL |
|
INTERPRO_OUTPUT |
|
BLAST_FIRST_ID_LIST |
|
GENBANK_FILE |
|
KEGG_PATHWAY_DESC |
|
KEGG_PATHWAY_BY_GENE |
|
KEGG_IMAGE_URL |
|
KEGG_IMAGE |
|
KEGG_DESCRIPTIONS |
|
KEGG_ID |
|
Datalinks (25)
Source |
Sink |
SendToTranseq:out |
transeq:sequence_direct_data |
transeq:outseq |
BLAST:sequence |
email:value |
BLAST:email |
transeq:outseq |
INTERPRO:sequence |
email:value |
INTERPRO:email |
transeq:outseq |
makeGenBankFormat:seqin |
BLAST:list |
makeGenBankFormat:uniProtIDs |
BLAST:list |
UNIPROTTOKEGG:id |
UNIPROTTOKEGG:kegg_id |
KEGG:KeggID |
BLAST:getResult_2_output_output |
BLAST_IDs |
BLAST:getStatus_output_status |
BLAST_STATUS |
BLAST:getResult_graphic_output_output |
BLAST_GRAPHICAL |
BLAST:getResult_original_output |
BLAST_OUTPUT |
INTERPRO:getResult_3_output_output |
INTERPRO_XML |
INTERPRO:Workflow16_getStatus_output_status |
INTERPRO_STATUS |
INTERPRO:Graphical_output |
INTERPRO_GRAPHICAL |
INTERPRO:getResult_output_output |
INTERPRO_OUTPUT |
BLAST:list |
BLAST_FIRST_ID_LIST |
makeGenBankFormat:contents |
GENBANK_FILE |
KEGG:pathway_descriptions |
KEGG_PATHWAY_DESC |
KEGG:pathway_by_genes |
KEGG_PATHWAY_BY_GENE |
KEGG:URL |
KEGG_IMAGE_URL |
KEGG:image |
KEGG_IMAGE |
KEGG:kegg_descriptions |
KEGG_DESCRIPTIONS |
KEGG:KEGGID |
KEGG_ID |
Coordinations (7)
Controller |
Target |
transeq |
makeGenBankFormat |
extract |
build_icm |
BLAST |
makeGenBankFormat |
build_icm |
glimmer |
glimmerSplit |
SendToTranseq |
longorf |
extract |
glimmer |
glimmerSplit |
Uploader
License
All versions of this Workflow are
licensed under:
Version 1
(of 1)
Credits (2)
(People/Groups)
Attributions (4)
(Workflows/Files)
Shared with Groups (0)
None
Featured In Packs (0)
None
Log in to add to one of your Packs
Attributed By (0)
(Workflows/Files)
None
Favourited By (0)
No one
Statistics
Other workflows that use similar services
(109)
Only the first 2 workflows that use similar services are shown. View all workflows that use these services.
Comments (0)
No comments yet
Log in to make a comment