FINAL VERSION

None

/*
This Java code takes two lists as input: one with UniProt IDs and one with sequences from transeq.
For each ID it connects to the UniProt website, downloads the record and adds it to a GenBank file.
*/

import java.io.*;

		// Builds a GenBank-style text file from two parallel input lists and stores it in
		// "contents" (and on disk):
		//   uniProtIDs  - UniProt identifiers, one per gene
		//   aaSequences - transeq records; element i belongs to uniProtIDs element i
		// Both lists and the "contents" output are bound outside this script.
		// NOTE(review): the original comments call the sequence input "seqin" while the
		// code reads "aaSequences" - confirm which name the workflow port really uses.

		//Header of the GenBank file; the feature entries are appended below
		StringBuilder genbankBuilder = new StringBuilder("LOCUS\nDEFINITION\n\nACCESSION\nVERSION\nKEYWORDS\nSOURCE\n  ORGANISM\n\n\nREFERENCE\n  AUTHORS\n  TITLE\n\n   JOURNAL\n\n   PUBMED\n\n\nREFERENCE\n  AUTHORS\n  TITLE\n\nJOURNAL\n\n   PUBMED\n\n\nREFERENCE\n  AUTHORS\n  TITLE\n\n FEATURES\n");

		//Every ID needs a matching sequence record, otherwise the lookup below fails mid-way
		if (aaSequences.size() < uniProtIDs.size()) {
			throw new IllegalArgumentException("Expected at least " + uniProtIDs.size()
					+ " sequence records but got " + aaSequences.size());
		}

		//Annotate every gene from the Blast results
		for (int i = 0; i < uniProtIDs.size(); i++) {
			//Space-separated fields of the "i"th sequence record:
			//[1] start position, [2] end position, [3] frame number, [4] translated sequence
			String[] seqarray = aaSequences.get(i).split("[ ]+");
			if (seqarray.length < 5) {
				throw new IllegalArgumentException("Sequence record " + i
						+ " has " + seqarray.length + " fields, expected at least 5");
			}
			String inStartPos = seqarray[1];
			String inEndPos = seqarray[2];
			String inFrameNumber = seqarray[3];
			String inSequence = seqarray[4];

			//Values collected from the UniProt record
			String geneID = null;
			String geneName = "";
			StringBuilder uniprotFunction = new StringBuilder();
			List gos = new ArrayList();
			//0 = FUNCTION topic not seen yet, 1 = inside it, 2 = finished
			int functionChecker = 0;

			//Download the flat-file record of the "i"th identifier
			URL inputURL = new URL("http://www.uniprot.org/uniprot/"
					+ uniProtIDs.get(i) + ".txt");
			URLConnection con = inputURL.openConnection();
			//getContentEncoding() reports the Content-Encoding header (e.g. "gzip"), not a
			//character set, so it must not be used as a charset name; decode as UTF-8
			BufferedReader reader = new BufferedReader(
					new InputStreamReader(con.getInputStream(), "UTF-8"));
			try {
				String line;
				while ((line = reader.readLine()) != null) {
					//Split each line at every space and semicolon
					String[] uniProtDataLine = line.split("[ ;]+");
					if (uniProtDataLine.length < 2) {
						//Nothing to parse on blank lines or the "//" terminator
						continue;
					}
					String lineType = uniProtDataLine[0];

					//ID line: second token is the entry name
					if (lineType.equals("ID")) {
						geneID = uniProtDataLine[1];
					}

					//GN line: keep the value of the first "key=value" token that is found;
					//continuation lines such as "GN   and" carry no '=' and are skipped
					if (lineType.equals("GN") && geneName.length() == 0) {
						int eq = uniProtDataLine[1].indexOf('=');
						if (eq >= 0 && eq < uniProtDataLine[1].length() - 1) {
							geneName = uniProtDataLine[1].substring(eq + 1);
						}
					}

					//CC lines: collect the text of the first "CC   -!- FUNCTION:" topic
					if (lineType.equals("CC") && functionChecker <= 1) {
						if (uniProtDataLine[1].equals("-!-")) {
							if (functionChecker == 0
									&& uniProtDataLine.length > 2
									&& uniProtDataLine[2].equals("FUNCTION:")) {
								functionChecker = 1;
								//Skip "CC", "-!-" and "FUNCTION:"
								for (int t = 3; t < uniProtDataLine.length; t++) {
									uniprotFunction.append(uniProtDataLine[t]).append(" ");
								}
							} else if (functionChecker == 1) {
								//A new topic starts, so the function text is complete
								functionChecker = 2;
							}
						} else if (functionChecker == 1) {
							if (uniProtDataLine[1].startsWith("--")) {
								//Dashed separator line ends the comment section
								functionChecker = 2;
							} else {
								for (int t = 1; t < uniProtDataLine.length; t++) {
									uniprotFunction.append(uniProtDataLine[t]).append(" ");
								}
							}
						}
					}

					//DR GO lines: keep the tokens in front of an "IEA" evidence token
					if (lineType.equals("DR") && uniProtDataLine[1].equals("GO")) {
						StringBuilder geneOntologies = new StringBuilder();
						for (int t = 2; t < uniProtDataLine.length; t++) {
							if (uniProtDataLine[t].contains("IEA")) {
								break;
							}
							geneOntologies.append(uniProtDataLine[t]).append(" ");
						}
						String ontology = geneOntologies.toString().trim();
						if (ontology.length() > 0) {
							gos.add(ontology);
						}
					}
				}
			} finally {
				//Release the connection even when reading or parsing fails
				reader.close();
			}

			if (geneID == null) {
				//Record had no ID line; fall back to the identifier that was requested
				geneID = String.valueOf(uniProtIDs.get(i));
			}

			//The translated sequence is the third piece when the field is split at '=', ' ' and '.'
			String[] arrayline3 = inSequence.split("[= .]+");
			if (arrayline3.length < 3) {
				throw new IllegalArgumentException("Sequence record " + i
						+ " has an unexpected sequence field: " + inSequence);
			}

			//Append the gene and CDS features of this record
			String location = inStartPos + ".." + inEndPos;
			genbankBuilder.append("\tgene\t\t").append(location).append("\n");
			genbankBuilder.append("\t\t\t/gene=\"").append(geneName).append("\"\n");
			genbankBuilder.append("\tCDS\t\t").append(location).append("\n");
			genbankBuilder.append("\t\t\t/gene=\"").append(geneName).append("\"\n");
			genbankBuilder.append("\t\t\t/function=\"").append(uniprotFunction.toString().trim()).append("\"\n");
			genbankBuilder.append("\t\t\t/protein_id=\"").append(geneID).append("\"\n");
			for (int k = 0; k < gos.size(); k++) {
				genbankBuilder.append("\t\t\t/db_xref=\"").append(gos.get(k)).append("\"\n");
			}
			genbankBuilder.append("\t\t\t/codon_start=").append(inFrameNumber).append("\n");
			genbankBuilder.append("\t\t\t/translation=\"").append(arrayline3[2]).append("\"\n");
		}
		genbankBuilder.append("\n\nORIGIN");
		String genbankfile = genbankBuilder.toString();

		//Saves the genbank file to the given directory. This can easily be changed to your own path
		BufferedWriter out = new BufferedWriter(new FileWriter(
				"/home/george/Desktop/EScience/GENBANKFILE/genbank.txt"));
		try {
			out.write(genbankfile);
		} finally {
			out.close();
		}
		contents = genbankfile;

Name	Type	Description
transeq	soaplab	Endpoint http://www.ebi.ac.uk/soaplab/services/nucleic_translation.transeq
email	stringconstant	Value g.georgiou@newcastle.ac.uk
BLAST	workflow
INTERPRO	workflow
makeGenBankFormat	localworker	Script /* This Java code has an input 2 lists. One list with UniProt's IDs and one with sequences from transeq. It connects to the website, gets the data and creates a GenBank file. / import java.io.; //Stores Header of GenBank file into genbankfile string String genbankfile = "LOCUS\nDEFINITION\n\nACCESSION\nVERSION\nKEYWORDS\nSOURSE\n ORGANISM\n\n\nREFERENCE\n AUTHORS\n TITLE\n\n JOURNAL\n\n PUBMED\n\n\nREFERENCE\n AUTHORS\n TITLE\n\nJOURNAL\n\n PUBMED\n\n\nREFERENCE\n AUTHORS\n TITLE\n\n FEATURES\n"; //Loop that goes through the list of IDs //This is used to annotate every gene from Blast results for (int i = 0; i < uniProtIDs.size(); i++) { //seqin is the the second list that is used as an input //It takes the "i"th sequence from the list //It splits at every space and stores it into str2 array String[] seqarray = aaSequences.get(i).split("[ ]+"); //It gets the second element of the array which is the start position of the gene String inStartPos = seqarray[1]; //It gets the third element of the array which is the end position of the gene String inEndPos = seqarray[2]; //It gets the forth element of the array which is the frame number holding the gene String inFrameNumber = seqarray[3]; //It gets the fifth element of the array which is the actual Amino Acid sequence after it gets translated from transeq String inSequence = seqarray[4]; //Variables boolean seqfound = false; String geneID = null; String uniprotSequence = ""; String lineLength = ""; String geneName = ""; String organismName = ""; String uniprotFunction = ""; String geneOntologies = ""; String line = null; boolean goChecker = true; int functionChecker = 0; List gos = new ArrayList(); //Gets the "i"th element from uniProtIDs list that contains uniprot identifiers and declares it as a URL URL inputURL = new URL("http://www.uniprot.org/uniprot/" + uniProtIDs.get(i) + ".txt"); //Open a connection between our machine and the previous URL URLConnection con = 
inputURL.openConnection(); //Stream the content of the website InputStream in = con.getInputStream(); StringBuffer result = new StringBuffer(); BufferedReader reader; //Get website's encoding String encoding = con.getContentEncoding(); //Check if there is encoding in the website //If there in not buffer the website //but if there is buffer using the encoding that was found above if (encoding == null) { reader = new BufferedReader(new InputStreamReader(in)); } else { reader = new BufferedReader(new InputStreamReader(in, encoding)); } //Loop through the website's data while ((line = reader.readLine()) != null) { //Split each line at every space and semicolon String[] uniProtDataLine = line.split("[ ;]+"); //Find ID //Check first position of the array uniProtDataLine if it equals to ID //If true assign the second element of the array to id if (uniProtDataLine[0].equals("ID")) { geneID = uniProtDataLine[1]; } //Find Gene's name //Check first position of the array uniProtDataLine if it equals to GN //If true assign the second element of the array to geneName if (uniProtDataLine[0].equals("GN")) { geneName = uniProtDataLine[1]; //Remove = character String[] arraygene = geneName.split("="); geneName = arraygene[1]; } //Find Organism's name //Check first position of the array uniProtDataLine if it equals to OS //If true assign the second element of the array to organismName if (uniProtDataLine[0].equals("OS")) { for (int i = 1; i < uniProtDataLine.length; i++) { organismName = organismName + uniProtDataLine[i] + " "; } } //Find gene's function //Check first position of the array uniProtDataLine if it equals to CC if (uniProtDataLine[0].equals("CC") && (functionChecker <= 1)) { //Because UniProt's file format has "CC -!- FUNCTION:" before the declaration //we have to use some check methods to get the function if (uniProtDataLine[1].equals("-!-")) { functionChecker++; if (functionChecker == 0) { for (int i = 3; i < uniProtDataLine.length; i++) { if 
(uniProtDataLine[i].contains("--")) { uniProtDataLine[i] = ""; functionChecker++; } uniprotFunction += uniProtDataLine[i] + " "; } } } if (!uniProtDataLine[1].equals("-!-") && (functionChecker == 1)) { for (int i = 1; i < uniProtDataLine.length; i++) { if (uniProtDataLine[i].contains("--")) { uniProtDataLine[i] = ""; functionChecker++; } uniprotFunction += uniProtDataLine[i] + " "; } } } //Reset goChecker and geneOntologies variables because they might have data from previous runs goChecker = true; geneOntologies = ""; //Find Gene's ontologies //Check first position of the array uniProtDataLine if it equals to DR and the second to GO if (uniProtDataLine[0].equals("DR") && uniProtDataLine[1].contains("GO")) { for (int i = 2; i < uniProtDataLine.length; i++) { //Remove unwanted data if (uniProtDataLine[i].contains("IEA")) { goChecker = false; } if (goChecker == true) { geneOntologies += uniProtDataLine[i] + " "; } } } if (!geneOntologies.equals("")) { gos.add(geneOntologies); } //Find end of file //Check first position of the array uniProtDataLine if it equals to // if (uniProtDataLine[0].equals("//")) { seqfound = false; } /* * This code was actually implemented to get the protein's sequence from * uniprot record. Wasn't used at the end because we got the sequence from transeq if (seqfound == true) { for (int i = 0; i < uniProtDataLine.length; i++) { uniprotSequence += uniProtDataLine[i]; if (uniprotSequence.toString().length() % 58 == 0) { } } uniprotSequence += "\n\t\t\t\t\t"; } if (uniProtDataLine[0].equals("SQ")) { seqfound = true; } }*/ //Stores information from the previous steps into genbankfile String genbankfile += "\tgene\t\t"; genbankfile += inStartPos + "..." + inEndPos + "\n"; genbankfile += "\t\t\t/gene=\"" + geneName + "\"\n"; genbankfile += "\tCDS\t\t"; genbankfile += inStartPos + "..." 
+ inEndPos + "\n"; genbankfile += "\t\t\t/gene=\"" + geneName + "\"\n"; genbankfile += "\t\t\t/function=\"" + uniprotFunction + "\"\n"; genbankfile += "\t\t\t/protein_id=\"" + geneID + "\n"; for (int k = 0; k < gos.size(); k++) { genbankfile += "\t\t\t/db_xref=\"" + gos.get(k) + "\"\n"; } genbankfile += "\t\t\t/codon_start=" + inFrameNumber + "\"\n"; String[] arrayline3 = inSequence.split("[= .]+"); genbankfile += "\t\t\t/translation=\"" + arrayline3[2] + "\"\n"; reader.close(); } genbankfile += "\n\nORIGIN"; //Saves the genbank file to the given directory. This can be easy changed to you own path FileWriter fstream = new FileWriter( "/home/george/Desktop/EScience/GENBANKFILE/genbank.txt"); BufferedWriter out = new BufferedWriter(fstream); out.write(genbankfile); out.close(); contents = genbankfile; }
glimmerSplit	externaltool
SendToTranseq	beanshell	Script /*
 * This Java code gets the output of glimmerspliter and stores each fasta sequence into a list.
 * Every list element holds one record: the '>' header line followed by its sequence lines,
 * each terminated by a newline. The finished list is assigned to "out".
 */
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;

List fastaSeqList = new ArrayList();

//Gets the multifasta file
//The file is stored in /home/george/Desktop/Scripts/ but you can define your own path
//(the file name contains literal '|' characters; "\|" is not a valid string escape)
File fastaFile = new File(
		"/home/george/Desktop/Scripts/ordered_gi|49482253|ref|NC_002952.2|_g3.tfa");
Scanner sc;
try {
	sc = new Scanner(fastaFile);
} catch (FileNotFoundException e) {
	//Fail here with a clear message instead of hitting a null scanner further down
	throw new IllegalStateException("Multi-FASTA file not found: " + fastaFile, e);
}

try {
	StringBuilder record = new StringBuilder();
	//Becomes true once the first sequence (non-header) line has been read
	boolean guard = false;
	//Loops through the whole file
	while (sc.hasNextLine()) {
		String output = sc.nextLine();
		if (output.length() == 0) {
			//Blank lines carry no data and have no first character to inspect
			continue;
		}
		//Check for > character that indicates a new fastA sequence
		if (output.charAt(0) != '>') {
			record.append(output).append("\n");
			guard = true;
		} else {
			//A new header closes the record collected so far
			if (guard) {
				fastaSeqList.add(record.toString());
				record.setLength(0);
			}
			record.append(output).append("\n");
		}
	}
	//Add the last record, unless the file contained nothing at all
	if (record.length() > 0) {
		fastaSeqList.add(record.toString());
	}
} finally {
	sc.close();
}
out = fastaSeqList;
glimmer	externaltool
build_icm	externaltool
extract	externaltool
longorf	externaltool
UNIPROTTOKEGG	workflow
KEGG	workflow
LocalBlast	workflow

Name	Inputs	Outputs
SendToTranseq		out
makelist	list	out
remove_Nulls	input	output
remove_nulls_2	input	output

Name	Description
BLAST_IDs
BLAST_STATUS
BLAST_GRAPHICAL
BLAST_OUTPUT
INTERPRO_XML
INTERPRO_STATUS
INTERPRO_GRAPHICAL
INTERPRO_OUTPUT
BLAST_FIRST_ID_LIST
GENBANK_FILE
KEGG_PATHWAY_DESC
KEGG_PATHWAY_BY_GENE
KEGG_IMAGE_URL
KEGG_IMAGE
KEGG_DESCRIPTIONS
KEGG_ID

Source	Sink
SendToTranseq:out	transeq:sequence_direct_data
transeq:outseq	BLAST:sequence
email:value	BLAST:email
transeq:outseq	INTERPRO:sequence
email:value	INTERPRO:email
transeq:outseq	makeGenBankFormat:seqin
BLAST:list	makeGenBankFormat:uniProtIDs
BLAST:list	UNIPROTTOKEGG:id
UNIPROTTOKEGG:kegg_id	KEGG:KeggID
BLAST:getResult_2_output_output	BLAST_IDs
BLAST:getStatus_output_status	BLAST_STATUS
BLAST:getResult_graphic_output_output	BLAST_GRAPHICAL
BLAST:getResult_original_output	BLAST_OUTPUT
INTERPRO:getResult_3_output_output	INTERPRO_XML
INTERPRO:Workflow16_getStatus_output_status	INTERPRO_STATUS
INTERPRO:Graphical_output	INTERPRO_GRAPHICAL
INTERPRO:getResult_output_output	INTERPRO_OUTPUT
BLAST:list	BLAST_FIRST_ID_LIST
makeGenBankFormat:contents	GENBANK_FILE
KEGG:pathway_descriptions	KEGG_PATHWAY_DESC
KEGG:pathway_by_genes	KEGG_PATHWAY_BY_GENE
KEGG:URL	KEGG_IMAGE_URL
KEGG:image	KEGG_IMAGE
KEGG:kegg_descriptions	KEGG_DESCRIPTIONS
KEGG:KEGGID	KEGG_ID

Controller	Target
transeq	makeGenBankFormat
extract	build_icm
BLAST	makeGenBankFormat
build_icm	glimmer
glimmerSplit	SendToTranseq
longorf	extract
glimmer	glimmerSplit

FINAL VERSION

Preview

Run

Run this Workflow in the Taverna Workbench...

Workflow Components

Endpoint

Value

Script

Script

Reviews (0)

Comments (0)

Other workflows that use similar services (109)