Parsing of PDB and Pfam data

Yang QiDong
yanfqidong0604 2019-01-07 09:51:43 +08:00
parent fb5557b62c
commit 53cb53f713
7 changed files with 990 additions and 0 deletions

@@ -0,0 +1,47 @@
{
"flow":{
"name":"test",
"uuid":"1234",
"stops":[
{
"uuid":"1111",
"name":"SelectFilesByName",
"bundle":"cn.piflow.bundle.ftp.SelectFilesByName",
"properties":{
"HDFSUrl":"hdfs://10.0.88.70:9000",
"HDFSPath":"/yqd/weishengwu/PDB/",
"selectionConditions":".*.ent.gz"
}
},{
"uuid":"2222",
"name":"UnzipFilesOnHDFS",
"bundle":"cn.piflow.bundle.http.UnzipFilesOnHDFS",
"properties":{
"isCustomize":"false",
"hdfsUrl":"hdfs://10.0.88.70:9000",
"filePath":"/yqd/weishengwu/PDB/",
"savePath":""
}
},{
"uuid":"3333",
"name":"PDBParser",
"bundle":"cn.piflow.bundle.microorganism.PDBParser",
"properties":{
}
}
],
"paths":[
{
"from":"SelectFilesByName",
"outport":"",
"inport":"",
"to":"UnzipFilesOnHDFS"
},{
"from":"UnzipFilesOnHDFS",
"outport":"",
"inport":"",
"to":"PDBParser"
}
]
}
}
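
A note on the configuration above: assuming selectionConditions is evaluated as a Java regular expression (as SelectFilesByName suggests), the dots in ".*.ent.gz" are unescaped and match any character, so the pattern is looser than the literal suffix ".ent.gz". A standalone sketch of the difference (plain Scala; the file names are made up):

object SelectionPatternCheck extends App {
  val loose  = ".*.ent.gz"     // the pattern as configured above; "." matches any character
  val strict = ".*\\.ent\\.gz" // escaped variant: only names ending in ".ent.gz"
  for (name <- Seq("pdb1abc.ent.gz", "pdb1abc_ent_gz", "notes.txt")) {
    println(s"$name -> loose=${name.matches(loose)}, strict=${name.matches(strict)}")
  }
}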

Binary file not shown (new image, 13 KiB).

Binary file not shown (new image, 6.0 KiB).

@@ -0,0 +1,146 @@
package cn.piflow.bundle.microorganism

import java.io._

import cn.piflow.bundle.microorganism.util.PDB
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.ImageUtil
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.json.JSONObject

class PDBParser extends ConfigurableStop {

  override val authorEmail: String = "yangqidong@cnic.cn"
  override val description: String = "Parse PDB format data"
  override val inportList: List[String] = List(PortEnum.DefaultPort.toString)
  override val outportList: List[String] = List(PortEnum.DefaultPort.toString)

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val session = pec.get[SparkSession]()
    val inDf: DataFrame = in.read()
    val configuration: Configuration = new Configuration()

    // Derive the HDFS URL (scheme://host:port/) from the first input path,
    // i.e. the first three "/"-separated segments of "hdfs://host:port/...".
    var pathStr: String = ""
    var hdfsUrl: String = ""
    try {
      pathStr = inDf.take(1)(0).get(0).asInstanceOf[String]
      val pathARR: Array[String] = pathStr.split("\\/")
      for (x <- 0 until 3) {
        hdfsUrl += (pathARR(x) + "/")
      }
    } catch {
      case e: Exception => throw new Exception("Path error")
    }

    configuration.set("fs.defaultFS", hdfsUrl)
    val fs: FileSystem = FileSystem.get(configuration)

    // Temporary file that accumulates all parsed records as one JSON array.
    val hdfsPathTemporary: String = hdfsUrl + "/Refseq_genomeParser_temporary.json"
    val path: Path = new Path(hdfsPathTemporary)
    if (fs.exists(path)) {
      fs.delete(path, true)
    }
    fs.create(path).close()
    val fdos: FSDataOutputStream = fs.append(path)
    val buff: Array[Byte] = new Array[Byte](1048576)

    var bis: BufferedInputStream = null
    var doc: JSONObject = null
    var pdb: PDB = null
    var jsonStr: String = ""
    var n: Int = 0

    inDf.collect().foreach(row => {
      pathStr = row.get(0).asInstanceOf[String]
      println("start parsing: " + pathStr)
      pdb = new PDB(pathStr, fs)
      doc = pdb.getDoc
      jsonStr = doc.toString
      n += 1

      // Prefix the first record with "[" and every later one with ",",
      // so the temporary file becomes a single JSON array.
      if (n == 1) {
        bis = new BufferedInputStream(new ByteArrayInputStream(("[" + jsonStr).getBytes()))
      } else {
        bis = new BufferedInputStream(new ByteArrayInputStream(("," + jsonStr).getBytes()))
      }
      var count: Int = bis.read(buff)
      while (count != -1) {
        fdos.write(buff, 0, count)
        fdos.flush()
        count = bis.read(buff)
      }
      fdos.flush()
      bis.close()
      doc = null
      jsonStr = ""
      pdb = null
    })

    // Close the JSON array.
    bis = new BufferedInputStream(new ByteArrayInputStream("]".getBytes()))
    var count: Int = bis.read(buff)
    while (count != -1) {
      fdos.write(buff, 0, count)
      fdos.flush()
      count = bis.read(buff)
    }
    fdos.flush()
    bis.close()
    fdos.close()

    val df: DataFrame = session.read.json(hdfsPathTemporary)
    df.show(20)
    out.write(df)
  }

  override def setProperties(map: Map[String, Any]): Unit = {
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    val descriptor: List[PropertyDescriptor] = List()
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("/microorganism/pdb-logo.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MicroorganismGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {
  }
}
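
The perform method above assembles the temporary HDFS file as one large JSON array by hand: "[" before the first record, "," before each later one, and a single "]" appended after the loop. The same pattern in isolation, as a minimal sketch (plain Scala with local streams instead of HDFS; all names are illustrative):

import java.io.{ByteArrayOutputStream, OutputStream}

object JsonArrayStreaming extends App {
  // Stream records out as one JSON array without holding them all in memory.
  def writeJsonArray(records: Iterator[String], out: OutputStream): Unit = {
    var first = true
    for (rec <- records) {
      out.write((if (first) "[" else ",").getBytes("UTF-8"))
      out.write(rec.getBytes("UTF-8"))
      first = false
    }
    if (first) out.write("[".getBytes("UTF-8")) // guard the empty-input case
    out.write("]".getBytes("UTF-8"))
  }

  val buf = new ByteArrayOutputStream()
  writeJsonArray(Iterator("""{"Idcode":"1ABC"}""", """{"Idcode":"2XYZ"}"""), buf)
  println(buf.toString("UTF-8")) // [{"Idcode":"1ABC"},{"Idcode":"2XYZ"}]
}

Note that the stop itself still writes "]" when the input DataFrame is empty, which leaves an invalid JSON file; the guard above is one way to cover that edge.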

@@ -0,0 +1,148 @@
package cn.piflow.bundle.microorganism

import java.io.{BufferedInputStream, BufferedReader, ByteArrayInputStream, InputStreamReader}

import cn.piflow.bundle.microorganism.util.Pfam
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.ImageUtil
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.json.JSONObject

class PfamDataParser extends ConfigurableStop {

  override val authorEmail: String = "yangqidong@cnic.cn"
  override val description: String = "Parse Pfam format data"
  override val inportList: List[String] = List(PortEnum.DefaultPort.toString)
  override val outportList: List[String] = List(PortEnum.DefaultPort.toString)

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val session = pec.get[SparkSession]()
    val inDf: DataFrame = in.read()
    val configuration: Configuration = new Configuration()

    // Derive the HDFS URL (scheme://host:port/) from the first input path,
    // i.e. the first three "/"-separated segments of "hdfs://host:port/...".
    var pathStr: String = ""
    var hdfsUrl: String = ""
    try {
      pathStr = inDf.take(1)(0).get(0).asInstanceOf[String]
      val pathARR: Array[String] = pathStr.split("\\/")
      for (x <- 0 until 3) {
        hdfsUrl += (pathARR(x) + "/")
      }
    } catch {
      case e: Exception => throw new Exception("Path error")
    }

    configuration.set("fs.defaultFS", hdfsUrl)
    val fs: FileSystem = FileSystem.get(configuration)

    // Temporary file that accumulates all parsed records as one JSON array.
    val hdfsPathTemporary: String = hdfsUrl + "/Refseq_genomeParser_temporary.json"
    val path: Path = new Path(hdfsPathTemporary)
    if (fs.exists(path)) {
      fs.delete(path, true)
    }
    fs.create(path).close()
    val fdos: FSDataOutputStream = fs.append(path)
    val buff: Array[Byte] = new Array[Byte](1048576)

    var bis: BufferedInputStream = null
    var fdis: FSDataInputStream = null
    var br: BufferedReader = null
    var doc: JSONObject = null
    var hasAnotherSequence: Boolean = true
    var jsonStr: String = ""
    var n: Int = 0

    inDf.collect().foreach(row => {
      pathStr = row.get(0).asInstanceOf[String]
      println("start parsing: " + pathStr)
      fdis = fs.open(new Path(pathStr))
      br = new BufferedReader(new InputStreamReader(fdis))
      // Reset for each file; otherwise only the first file would ever be read.
      hasAnotherSequence = true
      while (hasAnotherSequence && n < 1000) { // parse at most 1000 records in total
        n += 1
        doc = new JSONObject()
        hasAnotherSequence = Pfam.process(br, doc)
        jsonStr = doc.toString

        // Prefix the first record with "[" and every later one with ",",
        // so the temporary file becomes a single JSON array.
        if (n == 1) {
          bis = new BufferedInputStream(new ByteArrayInputStream(("[" + jsonStr).getBytes()))
        } else {
          bis = new BufferedInputStream(new ByteArrayInputStream(("," + jsonStr).getBytes()))
        }
        var count: Int = bis.read(buff)
        while (count != -1) {
          fdos.write(buff, 0, count)
          fdos.flush()
          count = bis.read(buff)
        }
        fdos.flush()
        bis.close()
        doc = null
        jsonStr = ""
      }
      br.close()
      fdis.close()
    })

    // Close the JSON array.
    bis = new BufferedInputStream(new ByteArrayInputStream("]".getBytes()))
    var count: Int = bis.read(buff)
    while (count != -1) {
      fdos.write(buff, 0, count)
      fdos.flush()
      count = bis.read(buff)
    }
    fdos.flush()
    bis.close()
    fdos.close()

    val df: DataFrame = session.read.json(hdfsPathTemporary)
    println(df.count())
    df.show(20)
    df.printSchema()
    out.write(df)
  }

  override def setProperties(map: Map[String, Any]): Unit = {
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    val descriptor: List[PropertyDescriptor] = List()
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("/microorganism/pfam.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MicroorganismGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {
  }
}

@@ -0,0 +1,301 @@
package cn.piflow.bundle.microorganism.util;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
import org.biojava.nbio.structure.*;
import org.biojava.nbio.structure.io.PDBFileReader;
import org.json.JSONArray;
import org.json.JSONObject;

import java.io.*;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

public class PDB {
    static final Logger logger = Logger.getLogger(PDB.class);
    static final DateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
    static final DateFormat pdbdateformatter = new SimpleDateFormat("dd-MMM-yy", Locale.US);
    static final String NEWLINE = System.getProperty("line.separator");

    private JSONObject doc;
    private String pdbFilePath;
    private FileSystem fs;

    public PDB(String path, FileSystem f) {
        this.pdbFilePath = path;
        this.doc = new JSONObject();
        this.fs = f;
        parsePDB();
    }

    public JSONObject getDoc() {
        return this.doc;
    }

    public void parsePDB() {
        parsePDBBioJava();
        parsePDBByLine();
    }

    // Fields that BioJava does not expose are parsed line by line.
    private void parsePDBByLine() {
        try {
            FSDataInputStream fis = fs.open(new Path(pdbFilePath));
            BufferedReader br = new BufferedReader(new InputStreamReader(fis));
            String line;
            while ((line = br.readLine()) != null) {
                // ignore empty lines
                if (line.equals("") || line.equals(NEWLINE)) {
                    continue;
                }
                // ignore short TER and END lines
                if (line.startsWith("TER") || line.startsWith("END")) {
                    continue;
                }
                if (line.length() < 6) {
                    logger.info("Found line length below 6. Ignoring it, line: >" + line + "<");
                    continue;
                }
                String recordName = line.substring(0, 6).trim();
                if (recordName.equals("HET")) {
                    het_Handler(line);
                } else if (recordName.equals("REVDAT")) {
                    revdat_Handler(line);
                } else if (recordName.equals("SEQRES")) {
                    seqres_Handler(line);
                } else if (recordName.equals("MODRES")) {
                    modres_Handler(line);
                } else if (recordName.equals("HETNAM")) {
                    hetnam_Handler(line);
                } else if (recordName.equals("HELIX")) {
                    helix_Handler(line);
                } else if (recordName.equals("MASTER")) {
                    master_Handler(line);
                } else if (recordName.equals("COMPND")) {
                    continue;
                }
            }
            br.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            logger.error("parsing error in processing: " + pdbFilePath);
            logger.error(e.getMessage());
        }
    }

    // Fields BioJava can process directly.
    private void parsePDBBioJava() {
        PDBFileReader pdbreader = new PDBFileReader();
        try {
            FSDataInputStream fis = fs.open(new Path(pdbFilePath));
            Structure struc = pdbreader.getStructure(fis);
            PDBHeader pdbHeader = struc.getPDBHeader();
            doc.put("DepositionDate", formatter.format(pdbHeader.getDepDate()));
            String depositionYear = doc.getString("DepositionDate").substring(0, 4);
            write_to_obj(doc, depositionYear, "DepositionYear");
            doc.put("Classification", pdbHeader.getClassification());
            doc.put("Idcode", pdbHeader.getIdCode());
            doc.put("ReferenceTitle", pdbHeader.getTitle());

            JSONArray compounds = new JSONArray();
            for (Compound c : struc.getCompounds()) {
                JSONObject compound = new JSONObject();
                compound.put("MOLID", c.getMolId());
                compound.put("MoleculeName", c.getMolName());
                String compound_Chain = "";
                for (Chain chain : c.getChains()) {
                    compound_Chain += ("," + chain.getChainID());
                }
                if (!compound_Chain.equals("")) {
                    compound.put("Chain", compound_Chain.substring(1));
                }
                compound.put("Engineered", c.getEngineered());
                compounds.put(compound);
            }
            doc.put("Compounds", compounds);

            if (pdbHeader.getExperimentalTechniques() != null) {
                Iterator<ExperimentalTechnique> experimentalTechniqueIterator = pdbHeader.getExperimentalTechniques().iterator();
                List<String> techniques = new ArrayList<String>();
                while (experimentalTechniqueIterator.hasNext()) {
                    techniques.add(experimentalTechniqueIterator.next().getName());
                }
                doc.put("Techniques", techniques);
            }
            doc.put("Author", pdbHeader.getAuthors());

            JSONArray sites = new JSONArray();
            for (Site site : struc.getSites()) {
                JSONObject siteObject = new JSONObject();
                siteObject.put("SiteIdentifier", site.getSiteID());
                siteObject.put("SiteDescription", site.getDescription());
                sites.put(siteObject);
            }
            doc.put("sites", sites);

            JSONArray dbRefs = new JSONArray();
            for (DBRef dbRef : struc.getDBRefs()) {
                JSONObject dbRefObject = new JSONObject();
                dbRefObject.put("ChainID", dbRef.getChainId());
                dbRefObject.put("SeqBegin", dbRef.getSeqBegin());
                dbRefObject.put("SeqEnd", dbRef.getSeqEnd());
                dbRefObject.put("DbName", dbRef.getDatabase());
                dbRefObject.put("DbAccession", dbRef.getDbAccession());
                dbRefObject.put("DbSeqBegin", dbRef.getDbSeqBegin());
                dbRefObject.put("DbSeqEnd", dbRef.getDbSeqEnd());
                dbRefs.put(dbRefObject);
            }
            doc.put("dbRefs", dbRefs);
        } catch (Exception e) {
            logger.error("parsing error in processing: " + pdbFilePath);
            logger.error(e.getMessage());
            e.printStackTrace();
        }
    }

    private void het_Handler(String line) {
        JSONObject hetObject = new JSONObject();
        String hetID = line.substring(7, 10).trim();
        String chainID = line.substring(12, 13);
        String seqNum = line.substring(14, 17).trim();
        String numHetAtoms = line.substring(21, 25).trim();
        hetObject.put("HetID", hetID);
        hetObject.put("ChainID", chainID);
        hetObject.put("SeqNum", seqNum);
        hetObject.put("NumHetAtoms", numHetAtoms);
        write_to_doc("hets", hetObject);
    }

    private void revdat_Handler(String line) throws ParseException {
        JSONObject revdatObject = new JSONObject();
        String modNumber = line.substring(8, 10).trim();
        write_to_obj(revdatObject, modNumber, "ModificationNumber");
        String modDateStr = line.substring(13, 22).trim();
        if (!modDateStr.equals("")) {
            Date modDate = pdbdateformatter.parse(modDateStr);
            revdatObject.put("ModificationDate", formatter.format(modDate));
        }
        write_to_doc("revdats", revdatObject);
    }

    private void seqres_Handler(String line) {
        JSONObject seqresObject = new JSONObject();
        String chainID = line.substring(11, 12);
        String numRes = line.substring(13, 17).trim();
        String acidSeq = line.substring(18).trim();
        seqresObject.put("chainID", chainID);
        write_to_obj(seqresObject, numRes, "NumRes");
        seqresObject.put("AcidSeq", acidSeq);
        write_to_doc("seqreses", seqresObject);
    }

    private void modres_Handler(String line) {
        JSONObject modresObject = new JSONObject();
        // MODRES example: MODRES 10MH 5NC C  427  DC  5-AZA-CYTIDINE-5'MONOPHOSPHATE
        String resName = line.substring(12, 15).trim();
        String chainID = line.substring(16, 17);
        String seqNum = line.substring(18, 22).trim();
        String stdRes = line.substring(23, 27).trim();
        String modComment = line.substring(28).trim();
        modresObject.put("ChainID", chainID);
        modresObject.put("ResName", resName);
        write_to_obj(modresObject, seqNum, "SeqNumber");
        modresObject.put("StdRes", stdRes);
        modresObject.put("ModificationComment", modComment);
        write_to_doc("modreses", modresObject);
    }

    private void hetnam_Handler(String line) {
        JSONObject hetnamObject = new JSONObject();
        String hetID = line.substring(10, 14).trim();
        String chemicalName = line.substring(15).trim();
        hetnamObject.put("HetID", hetID);
        hetnamObject.put("ChemicalName", chemicalName);
        write_to_doc("hetnams", hetnamObject);
    }

    private void helix_Handler(String line) {
        JSONObject helixObject = new JSONObject();
        String helixID = line.substring(12, 14).trim();
        if (line.trim().length() >= 40) {
            String helixClass = line.substring(39, 40);
            write_to_obj(helixObject, helixClass, "HelixClass");
        }
        if (line.trim().length() >= 76) {
            String helixLength = line.substring(72, 76).trim();
            write_to_obj(helixObject, helixLength, "HelixLength");
        }
        helixObject.put("HelixID", helixID);
        write_to_doc("helixes", helixObject);
    }

    private void master_Handler(String line) {
        JSONObject masterObject = new JSONObject();
        String numRemark = line.substring(11, 15).trim();
        String numHet = line.substring(21, 25).trim();
        String numHelix = line.substring(26, 30).trim();
        String numSheet = line.substring(31, 35).trim();
        String numTurn = line.substring(36, 40).trim();
        String numSite = line.substring(41, 45).trim();
        String numXForm = line.substring(46, 50).trim();
        String numCoord = line.substring(51, 55).trim();
        String numTer = line.substring(56, 60).trim();
        String numConect = line.substring(61, 65).trim();
        String numSeq = line.substring(66, 70).trim();
        write_to_obj(masterObject, numRemark, "NumRemark");
        write_to_obj(masterObject, numHet, "NumHet");
        write_to_obj(masterObject, numHelix, "NumHelix");
        write_to_obj(masterObject, numSheet, "NumSheet");
        write_to_obj(masterObject, numTurn, "NumTurn");
        write_to_obj(masterObject, numSite, "NumSite");
        write_to_obj(masterObject, numXForm, "NumXForm");
        write_to_obj(masterObject, numCoord, "NumCoord");
        write_to_obj(masterObject, numTer, "NumTer");
        write_to_obj(masterObject, numConect, "NumConect");
        write_to_obj(masterObject, numSeq, "NumSeq");
        write_to_doc("masters", masterObject);
    }

    // Parse str as an integer and store it under key; log and skip on failure.
    private static void write_to_obj(JSONObject obj, String str, String key) {
        try {
            obj.put(key, Integer.parseInt(str));
        } catch (NumberFormatException fe) {
            logger.error("parse integer error with string: " + str);
        }
    }

    // Append obj to the named array in doc, creating the array on first use.
    private void write_to_doc(String array_name, JSONObject obj) {
        if (doc.optJSONArray(array_name) != null) {
            doc.getJSONArray(array_name).put(obj);
        } else {
            JSONArray arr = new JSONArray();
            arr.put(obj);
            doc.put(array_name, arr);
        }
    }
}
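
The line handlers above all depend on the fixed column layout of PDB records. A quick standalone check of het_Handler's column slices (plain Scala; the HET record line is illustrative):

object HetColumns extends App {
  val line = "HET    NAG  A 501      14"
  println(line.substring(7, 10).trim)  // HetID (columns 8-10)        -> NAG
  println(line.substring(12, 13))      // ChainID (column 13)         -> A
  println(line.substring(14, 17).trim) // SeqNum (columns 15-17)      -> 501
  println(line.substring(21, 25).trim) // NumHetAtoms (columns 22-25) -> 14
}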

@@ -0,0 +1,348 @@
package cn.piflow.bundle.microorganism.util;

import org.biojava.bio.seq.io.ParseException;
import org.json.JSONArray;
import org.json.JSONObject;

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Pfam {
    // Compulsory fields
    protected static final String IDENTIFICATION_TAG = "ID";
    protected static final String ACCESSION_TAG = "AC";
    protected static final String DEFINITION_TAG = "DE";
    protected static final String AUTHOR_TAG = "AU";
    protected static final String SEED_SOURCE_TAG = "SE";
    protected static final String STRUCTURE_SOURCE_TAG = "SS";
    protected static final String BUILD_METHOD_TAG = "BM";
    protected static final String SEARCH_METHOD_TAG = "SM";
    protected static final String GATHERING_THRESHOLD_TAG = "GA";
    protected static final String TRUSTED_CUTOFF_TAG = "TC";
    protected static final String NOISE_CUTOFF_TAG = "NC";
    protected static final String TYPE_TAG = "TP";
    protected static final String SEQUENCE_TAG = "SQ";

    // Optional fields
    protected static final String DATABASE_COMMENT_TAG = "DC";
    protected static final String DATABASE_REFERENCE_TAG = "DR";
    protected static final String REF_COMMENT_TAG = "RC";
    protected static final String REF_NUMBER_TAG = "RN";
    protected static final String REF_MEDLINE_TAG = "RM";
    protected static final String REF_TITLE_TAG = "RT";
    protected static final String REF_AUTHOR_TAG = "RA";
    protected static final String REF_LOCATION_TAG = "RL";
    protected static final String PRE_IDENTIFIER_TAG = "PI";
    protected static final String KEYWORDS_TAG = "KW";
    protected static final String COMMENT_TAG = "CC";
    protected static final String PFAM_ACCESSION_TAG = "NE"; // indicates a nested domain
    protected static final String LOCATION_TAG = "NL"; // location of nested domain: sequence ID, start and end of insert
    protected static final String WIKI_LINK_TAG = "WK";
    protected static final String CLAN_TAG = "CL"; // clan accession
    protected static final String MEMBERSHIP_TAG = "MB";
    protected static final String END_SEQUENCE_TAG = "//";
    protected static final String STOCKHOLM_TAG = "# STOCKHOLM 1.0";

    protected static final Pattern gsseqp = Pattern.compile("^(.+)/(\\d+)-(\\d+)\\s+([A-Z]{2})\\s+(.+)$");
    protected static final Pattern seqp = Pattern.compile("^(\\S+)/(\\d+-\\d+)\\s+(.+)$");
    protected static final Pattern gcp = Pattern.compile("^(\\S+)\\s+(.+)$");

    // Reads one Stockholm entry from br into doc; returns true if more input remains.
    public static boolean process(BufferedReader br, JSONObject doc) throws IOException {
        String sectionKey = null;
        String sectionVal = null;
        // Local rather than static: end-of-file in one file must not leak into the next call.
        boolean hasAnotherSequence = true;
        JSONArray dbRefs = new JSONArray();
        JSONArray gcLines = new JSONArray();
        JSONArray rankedRefs = new JSONArray();
        Map<String, JSONObject> sequences = new HashMap<String, JSONObject>();
        try {
            List<String[]> section;
            do {
                section = readSection(br);
                sectionKey = section.get(0)[0];
                sectionVal = section.get(0)[1];
                if (section.size() == 1 && sectionVal != null) {
                    sectionVal = sectionVal.trim();
                    if (sectionVal.contains("\n")) {
                        sectionVal = sectionVal.replaceAll("\\n", "");
                    }
                    if (sectionVal.endsWith(";")) sectionVal = sectionVal.substring(0, sectionVal.length() - 1);
                    if (sectionKey.equals(IDENTIFICATION_TAG)) {
                        doc.put("identification", sectionVal);
                    } else if (sectionKey.equals(ACCESSION_TAG)) {
                        doc.put("accession", sectionVal);
                    } else if (sectionKey.equals(DEFINITION_TAG)) {
                        doc.put("definition", sectionVal);
                    } else if (sectionKey.equals(AUTHOR_TAG)) {
                        doc.put("author", sectionVal);
                    } else if (sectionKey.equals(GATHERING_THRESHOLD_TAG)) {
                        doc.put("gathering_threshold", sectionVal);
                    } else if (sectionKey.equals(TRUSTED_CUTOFF_TAG)) {
                        doc.put("trusted_cutoff", sectionVal);
                    } else if (sectionKey.equals(NOISE_CUTOFF_TAG)) {
                        doc.put("noise_cutoff", sectionVal);
                    } else if (sectionKey.equals(TYPE_TAG)) {
                        doc.put("type", sectionVal);
                    } else if (sectionKey.equals(SEQUENCE_TAG)) {
                        doc.put("sequence_length", Integer.parseInt(sectionVal));
                    } else if (sectionKey.equals(SEED_SOURCE_TAG)) {
                        doc.put("seed_source", sectionVal);
                    } else if (sectionKey.equals(STRUCTURE_SOURCE_TAG)) {
                        doc.put("struc_source", sectionVal);
                    } else if (sectionKey.equals(BUILD_METHOD_TAG)) {
                        doc.put("build_method", sectionVal);
                    } else if (sectionKey.equals(SEARCH_METHOD_TAG)) {
                        doc.put("search_method", sectionVal);
                    } else if (sectionKey.equals(PRE_IDENTIFIER_TAG)) {
                        doc.put("pre_identifier", sectionVal);
                    } else if (sectionKey.equals(KEYWORDS_TAG)) {
                        doc.put("keywords", sectionVal);
                    } else if (sectionKey.equals(COMMENT_TAG)) {
                        doc.put("comment", sectionVal);
                    } else if (sectionKey.equals(PFAM_ACCESSION_TAG)) {
                        doc.put("pfam_accession", sectionVal);
                    } else if (sectionKey.equals(LOCATION_TAG)) {
                        doc.put("location", sectionVal);
                    } else if (sectionKey.equals(WIKI_LINK_TAG)) {
                        doc.put("wiki_link", sectionVal);
                    } else if (sectionKey.equals(CLAN_TAG)) {
                        doc.put("clan", sectionVal);
                    } else if (sectionKey.equals(MEMBERSHIP_TAG)) {
                        doc.put("membership", sectionVal);
                    } else if (sectionKey.equals(DATABASE_REFERENCE_TAG)) {
                        JSONObject dbRef = new JSONObject();
                        String[] parts = sectionVal.split(";");
                        dbRef.put("databaseName", parts[0].trim());
                        dbRef.put("databaseID", parts[1].trim());
                        dbRefs.put(dbRef);
                    } else if (sectionKey.equals("STOCKHOLM")) {
                        // do nothing
                    } else if (sectionKey.equals("GSMarkUp")) {
                        // per-sequence annotation: name/start-end, a two-letter feature tag, and its value
                        JSONObject seq = new JSONObject();
                        Matcher m = gsseqp.matcher(sectionVal);
                        if (m.matches()) {
                            String seq_name = m.group(1);
                            String seq_start = m.group(2);
                            String seq_end = m.group(3);
                            String feature_tag = m.group(4);
                            String feature_value = m.group(5);
                            seq.put("seq_name", seq_name);
                            seq.put("seq_start", seq_start);
                            seq.put("seq_end", seq_end);
                            seq.put(feature_tag, feature_value);
                            sequences.put(seq_name, seq);
                        }
                    } else if (sectionKey.equals("Sequence")) { // sequence line with no previous tag
                        Matcher m = seqp.matcher(sectionVal);
                        if (m.matches()) {
                            String seq_name = m.group(1);
                            String sequence = m.group(3);
                            if (sequences.containsKey(seq_name)) {
                                sequences.get(seq_name).put("sequence", sequence);
                            }
                        }
                    } else if (sectionKey.equals("GRMarkUp")) { // more info just below the sequence
                        Matcher m = gsseqp.matcher(sectionVal);
                        if (m.matches()) {
                            String seq_name = m.group(1);
                            String feature_tag = m.group(4);
                            String feature_val = m.group(5);
                            if (sequences.containsKey(seq_name)) {
                                sequences.get(seq_name).put(feature_tag, feature_val);
                            }
                        }
                    } else if (sectionKey.equals("GCMarkUp")) {
                        Matcher m = gcp.matcher(sectionVal);
                        if (m.matches()) {
                            JSONObject gcObj = new JSONObject();
                            String feature_tag = m.group(1);
                            String feature_val = m.group(2);
                            gcObj.put(feature_tag, feature_val);
                            gcLines.put(gcObj);
                        }
                    } else {
                        String name = sectionKey.toLowerCase();
                        doc.put(name, sectionVal);
                    }
                } else if (section.size() > 1) {
                    if (sectionKey.equals(REF_NUMBER_TAG)) { // this section is a reference
                        JSONObject refObj = new JSONObject();
                        String refRank = sectionVal.trim();
                        refRank = refRank.substring(1, refRank.length() - 1); // strip the surrounding brackets, e.g. "[1]"
                        int ref_rank = Integer.parseInt(refRank);
                        String medlineID = null;
                        String title = null;
                        String authors = null;
                        String location = null;
                        String comment = null;
                        for (int i = 1; i < section.size(); i++) {
                            String key = section.get(i)[0];
                            String val = section.get(i)[1].trim();
                            if (val.contains("\n")) {
                                val = val.replaceAll("\\n", "");
                            }
                            if (val.endsWith(";")) val = val.substring(0, val.length() - 1);
                            if (key.equals(REF_MEDLINE_TAG)) { medlineID = val; }
                            if (key.equals(REF_TITLE_TAG)) { title = val; }
                            if (key.equals(REF_AUTHOR_TAG)) { authors = val; }
                            if (key.equals(REF_LOCATION_TAG)) { location = val; }
                            if (key.equals(REF_COMMENT_TAG)) { comment = val; }
                        }
                        refObj.put("rank", ref_rank);
                        if (medlineID != null) { // RM is optional; parsing null would throw
                            refObj.put("medlineID", Integer.parseInt(medlineID));
                        }
                        refObj.put("title", title);
                        refObj.put("authors", authors);
                        refObj.put("location", location);
                        refObj.put("comment", comment);
                        rankedRefs.put(refObj);
                    } else if (sectionKey.equals(DATABASE_REFERENCE_TAG)) {
                        JSONObject dbRef = new JSONObject();
                        String[] parts = sectionVal.split(";");
                        dbRef.put("databaseName", parts[0].trim());
                        dbRef.put("databaseID", parts[1].trim());
                        StringBuffer comment = new StringBuffer();
                        for (int i = 1; i < section.size(); i++) {
                            String key = section.get(i)[0];
                            String val = section.get(i)[1].trim();
                            if (key.equals(DATABASE_COMMENT_TAG)) {
                                comment.append(val);
                            }
                        }
                        dbRef.put("comment", comment.toString());
                        dbRefs.put(dbRef);
                    }
                }
            } while (!sectionKey.equals(END_SEQUENCE_TAG));
            doc.put("dbRefs", dbRefs);
            doc.put("gcLines", gcLines);
            JSONArray sequencesArr = new JSONArray();
            for (Map.Entry<String, JSONObject> entry : sequences.entrySet()) {
                sequencesArr.put(entry.getValue());
            }
            // TODO: 2016/6/24 solve the out-of-memory error: the sequences are very large and
            // two copies are kept here; putting them into doc makes a third copy.
            doc.put("sequences", sequencesArr);
            doc.put("rankedRefs", rankedRefs);
        } catch (ParseException e) {
            e.printStackTrace();
        }
        // Peek past trailing whitespace to see whether another entry follows.
        while (true) {
            br.mark(1);
            int c = br.read();
            if (c == -1) {
                hasAnotherSequence = false;
                break;
            }
            if (Character.isWhitespace((char) c)) {
                continue;
            }
            br.reset();
            break;
        }
        return hasAnotherSequence;
    }

    // Reads the next section of an entry: usually one tag with its (possibly multi-line)
    // value; reference (RN..RL) and database (DR/DC) groups come back as multi-element lists.
    public static List<String[]> readSection(BufferedReader br) throws ParseException {
        List<String[]> section = new ArrayList<String[]>();
        String line;
        boolean done = false;
        try {
            while (!done) {
                br.mark(160);
                line = br.readLine();
                if (line == null) { // EOF inside an entry: treat it as the end of the entry
                    section.add(new String[]{END_SEQUENCE_TAG, null});
                    done = true;
                } else if (line.equals(STOCKHOLM_TAG)) {
                    done = true;
                    section.add(new String[]{"STOCKHOLM", null});
                } else if (line.startsWith("#=GF")) {
                    String token = line.substring(5, 7);
                    if (token.equals(DEFINITION_TAG)) {
                        section.add(new String[]{DEFINITION_TAG, line.substring(8)});
                        done = true;
                    } else if (token.charAt(0) == 'R' || token.charAt(0) == 'D') {
                        // reference (R*) and database (D*) tags form multi-line groups
                        br.reset();
                        String currentTag = null;
                        char currentTagStart = '\0';
                        StringBuffer currentVal = null;
                        while (!done) {
                            br.mark(160);
                            line = br.readLine();
                            if (currentTagStart == '\0') currentTagStart = line.charAt(5);
                            if (line == null || !line.startsWith("#=GF " + currentTagStart) ||
                                    (currentTagStart == 'R' && currentTag != null && line.substring(5, 7).equals(REF_NUMBER_TAG)) ||
                                    (currentTagStart == 'D' && currentTag != null && line.substring(5, 7).equals(DATABASE_REFERENCE_TAG))) {
                                br.reset();
                                done = true;
                                // dump the current tag if it exists
                                if (currentTag != null) section.add(new String[]{currentTag, currentVal.toString()});
                            } else {
                                try {
                                    String tag = line.substring(5, 7);
                                    String value = line.substring(8);
                                    if (currentTag == null || !tag.equals(currentTag)) {
                                        if (currentTag != null) section.add(new String[]{currentTag, currentVal.toString()});
                                        currentTag = tag;
                                        currentVal = new StringBuffer();
                                        currentVal.append(value);
                                    } else {
                                        currentVal.append("\n");
                                        currentVal.append(value);
                                    }
                                } catch (Exception e) {
                                    throw new ParseException(e);
                                }
                            }
                        }
                    } else { // values on a single line or spread over multiple lines
                        StringBuffer currentVal = new StringBuffer();
                        currentVal.append(line.substring(8).trim());
                        while (!done) {
                            br.mark(160);
                            line = br.readLine();
                            // check startsWith before taking the substring: the next line may be
                            // shorter than 7 characters (e.g. "//"), and a following #=GS line
                            // might have "SQ" in the right position
                            if (line == null || !line.startsWith("#=GF") || !line.substring(5, 7).equals(token)) {
                                br.reset();
                                done = true;
                                section.add(new String[]{token, currentVal.toString()});
                            } else {
                                currentVal.append("\n");
                                currentVal.append(line.substring(8).trim());
                            }
                        }
                    }
                } else if (line.startsWith("#=GS")) {
                    done = true;
                    section.add(new String[]{"GSMarkUp", line.substring(5).trim()});
                } else if (line.startsWith("#=GR")) {
                    done = true;
                    section.add(new String[]{"GRMarkUp", line.substring(5).trim()});
                } else if (line.startsWith("#=GC")) {
                    done = true;
                    section.add(new String[]{"GCMarkUp", line.substring(5).trim()});
                } else if (line.startsWith(END_SEQUENCE_TAG)) {
                    section.add(new String[]{END_SEQUENCE_TAG, null});
                    done = true;
                } else { // no prefix tag: a plain sequence line
                    done = true;
                    section.add(new String[]{"Sequence", line.trim()});
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return section;
    }
}
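
Pfam.process consumes one Stockholm entry per call and reports whether more input remains, which is the contract the read loop in PfamDataParser depends on. A minimal driver (plain Scala, assuming the Pfam class above is on the classpath; the record is a shortened, illustrative Stockholm entry):

import java.io.{BufferedReader, StringReader}

import cn.piflow.bundle.microorganism.util.Pfam
import org.json.JSONObject

object PfamDriver extends App {
  val record =
    """# STOCKHOLM 1.0
      |#=GF ID   7tm_1
      |#=GF AC   PF00001.21
      |#=GF DE   7 transmembrane receptor (rhodopsin family)
      |#=GF TP   Family
      |//""".stripMargin
  val br = new BufferedReader(new StringReader(record))
  var hasNext = true
  while (hasNext) {
    val doc = new JSONObject()
    hasNext = Pfam.process(br, doc) // false once the reader is exhausted
    println(doc.toString(2))        // identification, accession, definition, type, ...
  }
}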