Parsing of PDB and Pfam data

Yang QiDong
yanfqidong0604 2019-01-07 09:51:43 +08:00
parent fb5557b62c
commit 53cb53f713
7 changed files with 990 additions and 0 deletions

@@ -0,0 +1,47 @@
{
"flow":{
"name":"test",
"uuid":"1234",
"stops":[
{
"uuid":"1111",
"name":"SelectFilesByName",
"bundle":"cn.piflow.bundle.ftp.SelectFilesByName",
"properties":{
"HDFSUrl":"hdfs://10.0.88.70:9000",
"HDFSPath":"/yqd/weishengwu/PDB/",
"selectionConditions":".*.ent.gz"
}
},{
"uuid":"2222",
"name":"UnzipFilesOnHDFS",
"bundle":"cn.piflow.bundle.http.UnzipFilesOnHDFS",
"properties":{
"isCustomize":"false",
"hdfsUrl":"hdfs://10.0.88.70:9000",
"filePath":"/yqd/weishengwu/PDB/",
"savePath":""
}
},{
"uuid":"3333",
"name":"PDBParser",
"bundle":"cn.piflow.bundle.microorganism.PDBParser",
"properties":{
}
}
],
"paths":[
{
"from":"SelectFilesByName",
"outport":"",
"inport":"",
"to":"UnzipFilesOnHDFS"
},{
"from":"UnzipFilesOnHDFS",
"outport":"",
"inport":"",
"to":"PDBParser"
}
]
}
}
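
A note on the configuration above: assuming selectionConditions is evaluated as a Java regular expression (as SelectFilesByName suggests), the dots in ".*.ent.gz" are unescaped and match any character, so the pattern is looser than the literal suffix ".ent.gz". A standalone sketch of the difference (plain Scala; the file names are made up):

object SelectionPatternCheck extends App {
  val loose  = ".*.ent.gz"     // the pattern as configured above; "." matches any character
  val strict = ".*\\.ent\\.gz" // escaped variant: only names ending in ".ent.gz"
  for (name <- Seq("pdb1abc.ent.gz", "pdb1abc_ent_gz", "notes.txt")) {
    println(s"$name -> loose=${name.matches(loose)}, strict=${name.matches(strict)}")
  }
}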

Binary file not shown (new image, 13 KiB).

Binary file not shown (new image, 6.0 KiB).

@@ -0,0 +1,146 @@
package cn.piflow.bundle.microorganism

import java.io._

import cn.piflow.bundle.microorganism.util.PDB
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.ImageUtil
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.json.JSONObject

class PDBParser extends ConfigurableStop {

  override val authorEmail: String = "yangqidong@cnic.cn"
  override val description: String = "Parse PDB format data"
  override val inportList: List[String] = List(PortEnum.DefaultPort.toString)
  override val outportList: List[String] = List(PortEnum.DefaultPort.toString)

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val session = pec.get[SparkSession]()
    val inDf: DataFrame = in.read()
    val configuration: Configuration = new Configuration()

    // Derive the HDFS URL (scheme://host:port/) from the first input path,
    // i.e. the first three "/"-separated segments of "hdfs://host:port/...".
    var pathStr: String = ""
    var hdfsUrl: String = ""
    try {
      pathStr = inDf.take(1)(0).get(0).asInstanceOf[String]
      val pathARR: Array[String] = pathStr.split("\\/")
      for (x <- 0 until 3) {
        hdfsUrl += (pathARR(x) + "/")
      }
    } catch {
      case e: Exception => throw new Exception("Path error")
    }

    configuration.set("fs.defaultFS", hdfsUrl)
    val fs: FileSystem = FileSystem.get(configuration)

    // Temporary file that accumulates all parsed records as one JSON array.
    val hdfsPathTemporary: String = hdfsUrl + "/Refseq_genomeParser_temporary.json"
    val path: Path = new Path(hdfsPathTemporary)
    if (fs.exists(path)) {
      fs.delete(path, true)
    }
    fs.create(path).close()
    val fdos: FSDataOutputStream = fs.append(path)
    val buff: Array[Byte] = new Array[Byte](1048576)

    var bis: BufferedInputStream = null
    var doc: JSONObject = null
    var pdb: PDB = null
    var jsonStr: String = ""
    var n: Int = 0

    inDf.collect().foreach(row => {
      pathStr = row.get(0).asInstanceOf[String]
      println("start parsing: " + pathStr)
      pdb = new PDB(pathStr, fs)
      doc = pdb.getDoc
      jsonStr = doc.toString
      n += 1

      // Prefix the first record with "[" and every later one with ",",
      // so the temporary file becomes a single JSON array.
      if (n == 1) {
        bis = new BufferedInputStream(new ByteArrayInputStream(("[" + jsonStr).getBytes()))
      } else {
        bis = new BufferedInputStream(new ByteArrayInputStream(("," + jsonStr).getBytes()))
      }
      var count: Int = bis.read(buff)
      while (count != -1) {
        fdos.write(buff, 0, count)
        fdos.flush()
        count = bis.read(buff)
      }
      fdos.flush()
      bis.close()
      doc = null
      jsonStr = ""
      pdb = null
    })

    // Close the JSON array.
    bis = new BufferedInputStream(new ByteArrayInputStream("]".getBytes()))
    var count: Int = bis.read(buff)
    while (count != -1) {
      fdos.write(buff, 0, count)
      fdos.flush()
      count = bis.read(buff)
    }
    fdos.flush()
    bis.close()
    fdos.close()

    val df: DataFrame = session.read.json(hdfsPathTemporary)
    df.show(20)
    out.write(df)
  }

  override def setProperties(map: Map[String, Any]): Unit = {
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    val descriptor: List[PropertyDescriptor] = List()
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("/microorganism/pdb-logo.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MicroorganismGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {
  }
}
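
The perform method above assembles the temporary HDFS file as one large JSON array by hand: "[" before the first record, "," before each later one, and a single "]" appended after the loop. The same pattern in isolation, as a minimal sketch (plain Scala with local streams instead of HDFS; all names are illustrative):

import java.io.{ByteArrayOutputStream, OutputStream}

object JsonArrayStreaming extends App {
  // Stream records out as one JSON array without holding them all in memory.
  def writeJsonArray(records: Iterator[String], out: OutputStream): Unit = {
    var first = true
    for (rec <- records) {
      out.write((if (first) "[" else ",").getBytes("UTF-8"))
      out.write(rec.getBytes("UTF-8"))
      first = false
    }
    if (first) out.write("[".getBytes("UTF-8")) // guard the empty-input case
    out.write("]".getBytes("UTF-8"))
  }

  val buf = new ByteArrayOutputStream()
  writeJsonArray(Iterator("""{"Idcode":"1ABC"}""", """{"Idcode":"2XYZ"}"""), buf)
  println(buf.toString("UTF-8")) // [{"Idcode":"1ABC"},{"Idcode":"2XYZ"}]
}

Note that the stop itself still writes "]" when the input DataFrame is empty, which leaves an invalid JSON file; the guard above is one way to cover that edge.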

@@ -0,0 +1,148 @@
package cn.piflow.bundle.microorganism

import java.io.{BufferedInputStream, BufferedReader, ByteArrayInputStream, InputStreamReader}

import cn.piflow.bundle.microorganism.util.Pfam
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.ImageUtil
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.json.JSONObject

class PfamDataParser extends ConfigurableStop {

  override val authorEmail: String = "yangqidong@cnic.cn"
  override val description: String = "Parse Pfam format data"
  override val inportList: List[String] = List(PortEnum.DefaultPort.toString)
  override val outportList: List[String] = List(PortEnum.DefaultPort.toString)

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val session = pec.get[SparkSession]()
    val inDf: DataFrame = in.read()
    val configuration: Configuration = new Configuration()

    // Derive the HDFS URL (scheme://host:port/) from the first input path,
    // i.e. the first three "/"-separated segments of "hdfs://host:port/...".
    var pathStr: String = ""
    var hdfsUrl: String = ""
    try {
      pathStr = inDf.take(1)(0).get(0).asInstanceOf[String]
      val pathARR: Array[String] = pathStr.split("\\/")
      for (x <- 0 until 3) {
        hdfsUrl += (pathARR(x) + "/")
      }
    } catch {
      case e: Exception => throw new Exception("Path error")
    }

    configuration.set("fs.defaultFS", hdfsUrl)
    val fs: FileSystem = FileSystem.get(configuration)

    // Temporary file that accumulates all parsed records as one JSON array.
    val hdfsPathTemporary: String = hdfsUrl + "/Refseq_genomeParser_temporary.json"
    val path: Path = new Path(hdfsPathTemporary)
    if (fs.exists(path)) {
      fs.delete(path, true)
    }
    fs.create(path).close()
    val fdos: FSDataOutputStream = fs.append(path)
    val buff: Array[Byte] = new Array[Byte](1048576)

    var bis: BufferedInputStream = null
    var fdis: FSDataInputStream = null
    var br: BufferedReader = null
    var doc: JSONObject = null
    var hasAnotherSequence: Boolean = true
    var jsonStr: String = ""
    var n: Int = 0

    inDf.collect().foreach(row => {
      pathStr = row.get(0).asInstanceOf[String]
      println("start parsing: " + pathStr)
      fdis = fs.open(new Path(pathStr))
      br = new BufferedReader(new InputStreamReader(fdis))
      // Reset for each file; otherwise only the first file would ever be read.
      hasAnotherSequence = true
      while (hasAnotherSequence && n < 1000) { // parse at most 1000 records in total
        n += 1
        doc = new JSONObject()
        hasAnotherSequence = Pfam.process(br, doc)
        jsonStr = doc.toString

        // Prefix the first record with "[" and every later one with ",",
        // so the temporary file becomes a single JSON array.
        if (n == 1) {
          bis = new BufferedInputStream(new ByteArrayInputStream(("[" + jsonStr).getBytes()))
        } else {
          bis = new BufferedInputStream(new ByteArrayInputStream(("," + jsonStr).getBytes()))
        }
        var count: Int = bis.read(buff)
        while (count != -1) {
          fdos.write(buff, 0, count)
          fdos.flush()
          count = bis.read(buff)
        }
        fdos.flush()
        bis.close()
        doc = null
        jsonStr = ""
      }
      br.close()
      fdis.close()
    })

    // Close the JSON array.
    bis = new BufferedInputStream(new ByteArrayInputStream("]".getBytes()))
    var count: Int = bis.read(buff)
    while (count != -1) {
      fdos.write(buff, 0, count)
      fdos.flush()
      count = bis.read(buff)
    }
    fdos.flush()
    bis.close()
    fdos.close()

    val df: DataFrame = session.read.json(hdfsPathTemporary)
    println(df.count())
    df.show(20)
    df.printSchema()
    out.write(df)
  }

  override def setProperties(map: Map[String, Any]): Unit = {
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    val descriptor: List[PropertyDescriptor] = List()
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("/microorganism/pfam.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MicroorganismGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {
  }
}

@@ -0,0 +1,301 @@
package cn.piflow.bundle.microorganism.util;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
import org.biojava.nbio.structure.*;
import org.biojava.nbio.structure.io.PDBFileReader;
import org.json.JSONArray;
import org.json.JSONObject;

import java.io.*;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

public class PDB {
    static final Logger logger = Logger.getLogger(PDB.class);
    static final DateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
    static final DateFormat pdbdateformatter = new SimpleDateFormat("dd-MMM-yy", Locale.US);
    static final String NEWLINE = System.getProperty("line.separator");

    private JSONObject doc;
    private String pdbFilePath;
    private FileSystem fs;

    public PDB(String path, FileSystem f) {
        this.pdbFilePath = path;
        this.doc = new JSONObject();
        this.fs = f;
        parsePDB();
    }

    public JSONObject getDoc() {
        return this.doc;
    }

    public void parsePDB() {
        parsePDBBioJava();
        parsePDBByLine();
    }

    // Fields that BioJava does not expose are parsed line by line.
    private void parsePDBByLine() {
        try {
            FSDataInputStream fis = fs.open(new Path(pdbFilePath));
            BufferedReader br = new BufferedReader(new InputStreamReader(fis));
            String line;
            while ((line = br.readLine()) != null) {
                // ignore empty lines
                if (line.equals("") || line.equals(NEWLINE)) {
                    continue;
                }
                // ignore short TER and END lines
                if (line.startsWith("TER") || line.startsWith("END")) {
                    continue;
                }
                if (line.length() < 6) {
                    logger.info("Found line length below 6. Ignoring it, line: >" + line + "<");
                    continue;
                }
                String recordName = line.substring(0, 6).trim();
                if (recordName.equals("HET")) {
                    het_Handler(line);
                } else if (recordName.equals("REVDAT")) {
                    revdat_Handler(line);
                } else if (recordName.equals("SEQRES")) {
                    seqres_Handler(line);
                } else if (recordName.equals("MODRES")) {
                    modres_Handler(line);
                } else if (recordName.equals("HETNAM")) {
                    hetnam_Handler(line);
                } else if (recordName.equals("HELIX")) {
                    helix_Handler(line);
                } else if (recordName.equals("MASTER")) {
                    master_Handler(line);
                } else if (recordName.equals("COMPND")) {
                    continue;
                }
            }
            br.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            logger.error("parsing error in processing: " + pdbFilePath);
            logger.error(e.getMessage());
        }
    }

    // Fields BioJava can process directly.
    private void parsePDBBioJava() {
        PDBFileReader pdbreader = new PDBFileReader();
        try {
            FSDataInputStream fis = fs.open(new Path(pdbFilePath));
            Structure struc = pdbreader.getStructure(fis);
            PDBHeader pdbHeader = struc.getPDBHeader();
            doc.put("DepositionDate", formatter.format(pdbHeader.getDepDate()));
            String depositionYear = doc.getString("DepositionDate").substring(0, 4);
            write_to_obj(doc, depositionYear, "DepositionYear");
            doc.put("Classification", pdbHeader.getClassification());
            doc.put("Idcode", pdbHeader.getIdCode());
            doc.put("ReferenceTitle", pdbHeader.getTitle());

            JSONArray compounds = new JSONArray();
            for (Compound c : struc.getCompounds()) {
                JSONObject compound = new JSONObject();
                compound.put("MOLID", c.getMolId());
                compound.put("MoleculeName", c.getMolName());
                String compound_Chain = "";
                for (Chain chain : c.getChains()) {
                    compound_Chain += ("," + chain.getChainID());
                }
                if (!compound_Chain.equals("")) {
                    compound.put("Chain", compound_Chain.substring(1));
                }
                compound.put("Engineered", c.getEngineered());
                compounds.put(compound);
            }
            doc.put("Compounds", compounds);

            if (pdbHeader.getExperimentalTechniques() != null) {
                Iterator<ExperimentalTechnique> experimentalTechniqueIterator = pdbHeader.getExperimentalTechniques().iterator();
                List<String> techniques = new ArrayList<String>();
                while (experimentalTechniqueIterator.hasNext()) {
                    techniques.add(experimentalTechniqueIterator.next().getName());
                }
                doc.put("Techniques", techniques);
            }
            doc.put("Author", pdbHeader.getAuthors());

            JSONArray sites = new JSONArray();
            for (Site site : struc.getSites()) {
                JSONObject siteObject = new JSONObject();
                siteObject.put("SiteIdentifier", site.getSiteID());
                siteObject.put("SiteDescription", site.getDescription());
                sites.put(siteObject);
            }
            doc.put("sites", sites);

            JSONArray dbRefs = new JSONArray();
            for (DBRef dbRef : struc.getDBRefs()) {
                JSONObject dbRefObject = new JSONObject();
                dbRefObject.put("ChainID", dbRef.getChainId());
                dbRefObject.put("SeqBegin", dbRef.getSeqBegin());
                dbRefObject.put("SeqEnd", dbRef.getSeqEnd());
                dbRefObject.put("DbName", dbRef.getDatabase());
                dbRefObject.put("DbAccession", dbRef.getDbAccession());
                dbRefObject.put("DbSeqBegin", dbRef.getDbSeqBegin());
                dbRefObject.put("DbSeqEnd", dbRef.getDbSeqEnd());
                dbRefs.put(dbRefObject);
            }
            doc.put("dbRefs", dbRefs);
        } catch (Exception e) {
            logger.error("parsing error in processing: " + pdbFilePath);
            logger.error(e.getMessage());
            e.printStackTrace();
        }
    }

    private void het_Handler(String line) {
        JSONObject hetObject = new JSONObject();
        String hetID = line.substring(7, 10).trim();
        String chainID = line.substring(12, 13);
        String seqNum = line.substring(14, 17).trim();
        String numHetAtoms = line.substring(21, 25).trim();
        hetObject.put("HetID", hetID);
        hetObject.put("ChainID", chainID);
        hetObject.put("SeqNum", seqNum);
        hetObject.put("NumHetAtoms", numHetAtoms);
        write_to_doc("hets", hetObject);
    }

    private void revdat_Handler(String line) throws ParseException {
        JSONObject revdatObject = new JSONObject();
        String modNumber = line.substring(8, 10).trim();
        write_to_obj(revdatObject, modNumber, "ModificationNumber");
        String modDateStr = line.substring(13, 22).trim();
        if (!modDateStr.equals("")) {
            Date modDate = pdbdateformatter.parse(modDateStr);
            revdatObject.put("ModificationDate", formatter.format(modDate));
        }
        write_to_doc("revdats", revdatObject);
    }

    private void seqres_Handler(String line) {
        JSONObject seqresObject = new JSONObject();
        String chainID = line.substring(11, 12);
        String numRes = line.substring(13, 17).trim();
        String acidSeq = line.substring(18).trim();
        seqresObject.put("chainID", chainID);
        write_to_obj(seqresObject, numRes, "NumRes");
        seqresObject.put("AcidSeq", acidSeq);
        write_to_doc("seqreses", seqresObject);
    }

    private void modres_Handler(String line) {
        JSONObject modresObject = new JSONObject();
        // MODRES example: MODRES 10MH 5NC C  427  DC  5-AZA-CYTIDINE-5'MONOPHOSPHATE
        String resName = line.substring(12, 15).trim();
        String chainID = line.substring(16, 17);
        String seqNum = line.substring(18, 22).trim();
        String stdRes = line.substring(23, 27).trim();
        String modComment = line.substring(28).trim();
        modresObject.put("ChainID", chainID);
        modresObject.put("ResName", resName);
        write_to_obj(modresObject, seqNum, "SeqNumber");
        modresObject.put("StdRes", stdRes);
        modresObject.put("ModificationComment", modComment);
        write_to_doc("modreses", modresObject);
    }

    private void hetnam_Handler(String line) {
        JSONObject hetnamObject = new JSONObject();
        String hetID = line.substring(10, 14).trim();
        String chemicalName = line.substring(15).trim();
        hetnamObject.put("HetID", hetID);
        hetnamObject.put("ChemicalName", chemicalName);
        write_to_doc("hetnams", hetnamObject);
    }

    private void helix_Handler(String line) {
        JSONObject helixObject = new JSONObject();
        String helixID = line.substring(12, 14).trim();
        if (line.trim().length() >= 40) {
            String helixClass = line.substring(39, 40);
            write_to_obj(helixObject, helixClass, "HelixClass");
        }
        if (line.trim().length() >= 76) {
            String helixLength = line.substring(72, 76).trim();
            write_to_obj(helixObject, helixLength, "HelixLength");
        }
        helixObject.put("HelixID", helixID);
        write_to_doc("helixes", helixObject);
    }

    private void master_Handler(String line) {
        JSONObject masterObject = new JSONObject();
        String numRemark = line.substring(11, 15).trim();
        String numHet = line.substring(21, 25).trim();
        String numHelix = line.substring(26, 30).trim();
        String numSheet = line.substring(31, 35).trim();
        String numTurn = line.substring(36, 40).trim();
        String numSite = line.substring(41, 45).trim();
        String numXForm = line.substring(46, 50).trim();
        String numCoord = line.substring(51, 55).trim();
        String numTer = line.substring(56, 60).trim();
        String numConect = line.substring(61, 65).trim();
        String numSeq = line.substring(66, 70).trim();
        write_to_obj(masterObject, numRemark, "NumRemark");
        write_to_obj(masterObject, numHet, "NumHet");
        write_to_obj(masterObject, numHelix, "NumHelix");
        write_to_obj(masterObject, numSheet, "NumSheet");
        write_to_obj(masterObject, numTurn, "NumTurn");
        write_to_obj(masterObject, numSite, "NumSite");
        write_to_obj(masterObject, numXForm, "NumXForm");
        write_to_obj(masterObject, numCoord, "NumCoord");
        write_to_obj(masterObject, numTer, "NumTer");
        write_to_obj(masterObject, numConect, "NumConect");
        write_to_obj(masterObject, numSeq, "NumSeq");
        write_to_doc("masters", masterObject);
    }

    // Parse str as an integer and store it under key; log and skip on failure.
    private static void write_to_obj(JSONObject obj, String str, String key) {
        try {
            obj.put(key, Integer.parseInt(str));
        } catch (NumberFormatException fe) {
            logger.error("parse integer error with string: " + str);
        }
    }

    // Append obj to the named array in doc, creating the array on first use.
    private void write_to_doc(String array_name, JSONObject obj) {
        if (doc.optJSONArray(array_name) != null) {
            doc.getJSONArray(array_name).put(obj);
        } else {
            JSONArray arr = new JSONArray();
            arr.put(obj);
            doc.put(array_name, arr);
        }
    }
}
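
The line handlers above all depend on the fixed column layout of PDB records. A quick standalone check of het_Handler's column slices (plain Scala; the HET record line is illustrative):

object HetColumns extends App {
  val line = "HET    NAG  A 501      14"
  println(line.substring(7, 10).trim)  // HetID (columns 8-10)        -> NAG
  println(line.substring(12, 13))      // ChainID (column 13)         -> A
  println(line.substring(14, 17).trim) // SeqNum (columns 15-17)      -> 501
  println(line.substring(21, 25).trim) // NumHetAtoms (columns 22-25) -> 14
}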

@@ -0,0 +1,348 @@
package cn.piflow.bundle.microorganism.util;

import org.biojava.bio.seq.io.ParseException;
import org.json.JSONArray;
import org.json.JSONObject;

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Pfam {
    // Compulsory fields
    protected static final String IDENTIFICATION_TAG = "ID";
    protected static final String ACCESSION_TAG = "AC";
    protected static final String DEFINITION_TAG = "DE";
    protected static final String AUTHOR_TAG = "AU";
    protected static final String SEED_SOURCE_TAG = "SE";
    protected static final String STRUCTURE_SOURCE_TAG = "SS";
    protected static final String BUILD_METHOD_TAG = "BM";
    protected static final String SEARCH_METHOD_TAG = "SM";
    protected static final String GATHERING_THRESHOLD_TAG = "GA";
    protected static final String TRUSTED_CUTOFF_TAG = "TC";
    protected static final String NOISE_CUTOFF_TAG = "NC";
    protected static final String TYPE_TAG = "TP";
    protected static final String SEQUENCE_TAG = "SQ";

    // Optional fields
    protected static final String DATABASE_COMMENT_TAG = "DC";
    protected static final String DATABASE_REFERENCE_TAG = "DR";
    protected static final String REF_COMMENT_TAG = "RC";
    protected static final String REF_NUMBER_TAG = "RN";
    protected static final String REF_MEDLINE_TAG = "RM";
    protected static final String REF_TITLE_TAG = "RT";
    protected static final String REF_AUTHOR_TAG = "RA";
    protected static final String REF_LOCATION_TAG = "RL";
    protected static final String PRE_IDENTIFIER_TAG = "PI";
    protected static final String KEYWORDS_TAG = "KW";
    protected static final String COMMENT_TAG = "CC";
    protected static final String PFAM_ACCESSION_TAG = "NE"; // indicates a nested domain
    protected static final String LOCATION_TAG = "NL"; // location of nested domain: sequence ID, start and end of insert
    protected static final String WIKI_LINK_TAG = "WK";
    protected static final String CLAN_TAG = "CL"; // clan accession
    protected static final String MEMBERSHIP_TAG = "MB";
    protected static final String END_SEQUENCE_TAG = "//";
    protected static final String STOCKHOLM_TAG = "# STOCKHOLM 1.0";

    protected static final Pattern gsseqp = Pattern.compile("^(.+)/(\\d+)-(\\d+)\\s+([A-Z]{2})\\s+(.+)$");
    protected static final Pattern seqp = Pattern.compile("^(\\S+)/(\\d+-\\d+)\\s+(.+)$");
    protected static final Pattern gcp = Pattern.compile("^(\\S+)\\s+(.+)$");

    // Reads one Stockholm entry from br into doc; returns true if more input remains.
    public static boolean process(BufferedReader br, JSONObject doc) throws IOException {
        String sectionKey = null;
        String sectionVal = null;
        // Local rather than static: end-of-file in one file must not leak into the next call.
        boolean hasAnotherSequence = true;
        JSONArray dbRefs = new JSONArray();
        JSONArray gcLines = new JSONArray();
        JSONArray rankedRefs = new JSONArray();
        Map<String, JSONObject> sequences = new HashMap<String, JSONObject>();
        try {
            List<String[]> section;
            do {
                section = readSection(br);
                sectionKey = section.get(0)[0];
                sectionVal = section.get(0)[1];
                if (section.size() == 1 && sectionVal != null) {
                    sectionVal = sectionVal.trim();
                    if (sectionVal.contains("\n")) {
                        sectionVal = sectionVal.replaceAll("\\n", "");
                    }
                    if (sectionVal.endsWith(";")) sectionVal = sectionVal.substring(0, sectionVal.length() - 1);
                    if (sectionKey.equals(IDENTIFICATION_TAG)) {
                        doc.put("identification", sectionVal);
                    } else if (sectionKey.equals(ACCESSION_TAG)) {
                        doc.put("accession", sectionVal);
                    } else if (sectionKey.equals(DEFINITION_TAG)) {
                        doc.put("definition", sectionVal);
                    } else if (sectionKey.equals(AUTHOR_TAG)) {
                        doc.put("author", sectionVal);
                    } else if (sectionKey.equals(GATHERING_THRESHOLD_TAG)) {
                        doc.put("gathering_threshold", sectionVal);
                    } else if (sectionKey.equals(TRUSTED_CUTOFF_TAG)) {
                        doc.put("trusted_cutoff", sectionVal);
                    } else if (sectionKey.equals(NOISE_CUTOFF_TAG)) {
                        doc.put("noise_cutoff", sectionVal);
                    } else if (sectionKey.equals(TYPE_TAG)) {
                        doc.put("type", sectionVal);
                    } else if (sectionKey.equals(SEQUENCE_TAG)) {
                        doc.put("sequence_length", Integer.parseInt(sectionVal));
                    } else if (sectionKey.equals(SEED_SOURCE_TAG)) {
                        doc.put("seed_source", sectionVal);
                    } else if (sectionKey.equals(STRUCTURE_SOURCE_TAG)) {
                        doc.put("struc_source", sectionVal);
                    } else if (sectionKey.equals(BUILD_METHOD_TAG)) {
                        doc.put("build_method", sectionVal);
                    } else if (sectionKey.equals(SEARCH_METHOD_TAG)) {
                        doc.put("search_method", sectionVal);
                    } else if (sectionKey.equals(PRE_IDENTIFIER_TAG)) {
                        doc.put("pre_identifier", sectionVal);
                    } else if (sectionKey.equals(KEYWORDS_TAG)) {
                        doc.put("keywords", sectionVal);
                    } else if (sectionKey.equals(COMMENT_TAG)) {
                        doc.put("comment", sectionVal);
                    } else if (sectionKey.equals(PFAM_ACCESSION_TAG)) {
                        doc.put("pfam_accession", sectionVal);
                    } else if (sectionKey.equals(LOCATION_TAG)) {
                        doc.put("location", sectionVal);
                    } else if (sectionKey.equals(WIKI_LINK_TAG)) {
                        doc.put("wiki_link", sectionVal);
                    } else if (sectionKey.equals(CLAN_TAG)) {
                        doc.put("clan", sectionVal);
                    } else if (sectionKey.equals(MEMBERSHIP_TAG)) {
                        doc.put("membership", sectionVal);
                    } else if (sectionKey.equals(DATABASE_REFERENCE_TAG)) {
                        JSONObject dbRef = new JSONObject();
                        String[] parts = sectionVal.split(";");
                        dbRef.put("databaseName", parts[0].trim());
                        dbRef.put("databaseID", parts[1].trim());
                        dbRefs.put(dbRef);
                    } else if (sectionKey.equals("STOCKHOLM")) {
                        // do nothing
                    } else if (sectionKey.equals("GSMarkUp")) {
                        // per-sequence annotation: name/start-end, a two-letter feature tag, and its value
                        JSONObject seq = new JSONObject();
                        Matcher m = gsseqp.matcher(sectionVal);
                        if (m.matches()) {
                            String seq_name = m.group(1);
                            String seq_start = m.group(2);
                            String seq_end = m.group(3);
                            String feature_tag = m.group(4);
                            String feature_value = m.group(5);
                            seq.put("seq_name", seq_name);
                            seq.put("seq_start", seq_start);
                            seq.put("seq_end", seq_end);
                            seq.put(feature_tag, feature_value);
                            sequences.put(seq_name, seq);
                        }
                    } else if (sectionKey.equals("Sequence")) { // sequence line with no previous tag
                        Matcher m = seqp.matcher(sectionVal);
                        if (m.matches()) {
                            String seq_name = m.group(1);
                            String sequence = m.group(3);
                            if (sequences.containsKey(seq_name)) {
                                sequences.get(seq_name).put("sequence", sequence);
                            }
                        }
                    } else if (sectionKey.equals("GRMarkUp")) { // more info just below the sequence
                        Matcher m = gsseqp.matcher(sectionVal);
                        if (m.matches()) {
                            String seq_name = m.group(1);
                            String feature_tag = m.group(4);
                            String feature_val = m.group(5);
                            if (sequences.containsKey(seq_name)) {
                                sequences.get(seq_name).put(feature_tag, feature_val);
                            }
                        }
                    } else if (sectionKey.equals("GCMarkUp")) {
                        Matcher m = gcp.matcher(sectionVal);
                        if (m.matches()) {
                            JSONObject gcObj = new JSONObject();
                            String feature_tag = m.group(1);
                            String feature_val = m.group(2);
                            gcObj.put(feature_tag, feature_val);
                            gcLines.put(gcObj);
                        }
                    } else {
                        String name = sectionKey.toLowerCase();
                        doc.put(name, sectionVal);
                    }
                } else if (section.size() > 1) {
                    if (sectionKey.equals(REF_NUMBER_TAG)) { // this section is a reference
                        JSONObject refObj = new JSONObject();
                        String refRank = sectionVal.trim();
                        refRank = refRank.substring(1, refRank.length() - 1); // strip the surrounding brackets, e.g. "[1]"
                        int ref_rank = Integer.parseInt(refRank);
                        String medlineID = null;
                        String title = null;
                        String authors = null;
                        String location = null;
                        String comment = null;
                        for (int i = 1; i < section.size(); i++) {
                            String key = section.get(i)[0];
                            String val = section.get(i)[1].trim();
                            if (val.contains("\n")) {
                                val = val.replaceAll("\\n", "");
                            }
                            if (val.endsWith(";")) val = val.substring(0, val.length() - 1);
                            if (key.equals(REF_MEDLINE_TAG)) { medlineID = val; }
                            if (key.equals(REF_TITLE_TAG)) { title = val; }
                            if (key.equals(REF_AUTHOR_TAG)) { authors = val; }
                            if (key.equals(REF_LOCATION_TAG)) { location = val; }
                            if (key.equals(REF_COMMENT_TAG)) { comment = val; }
                        }
                        refObj.put("rank", ref_rank);
                        if (medlineID != null) { // RM is optional; parsing null would throw
                            refObj.put("medlineID", Integer.parseInt(medlineID));
                        }
                        refObj.put("title", title);
                        refObj.put("authors", authors);
                        refObj.put("location", location);
                        refObj.put("comment", comment);
                        rankedRefs.put(refObj);
                    } else if (sectionKey.equals(DATABASE_REFERENCE_TAG)) {
                        JSONObject dbRef = new JSONObject();
                        String[] parts = sectionVal.split(";");
                        dbRef.put("databaseName", parts[0].trim());
                        dbRef.put("databaseID", parts[1].trim());
                        StringBuffer comment = new StringBuffer();
                        for (int i = 1; i < section.size(); i++) {
                            String key = section.get(i)[0];
                            String val = section.get(i)[1].trim();
                            if (key.equals(DATABASE_COMMENT_TAG)) {
                                comment.append(val);
                            }
                        }
                        dbRef.put("comment", comment.toString());
                        dbRefs.put(dbRef);
                    }
                }
            } while (!sectionKey.equals(END_SEQUENCE_TAG));
            doc.put("dbRefs", dbRefs);
            doc.put("gcLines", gcLines);
            JSONArray sequencesArr = new JSONArray();
            for (Map.Entry<String, JSONObject> entry : sequences.entrySet()) {
                sequencesArr.put(entry.getValue());
            }
            // TODO: 2016/6/24 solve the out-of-memory error: the sequences are very large and
            // two copies are kept here; putting them into doc makes a third copy.
            doc.put("sequences", sequencesArr);
            doc.put("rankedRefs", rankedRefs);
        } catch (ParseException e) {
            e.printStackTrace();
        }
        // Peek past trailing whitespace to see whether another entry follows.
        while (true) {
            br.mark(1);
            int c = br.read();
            if (c == -1) {
                hasAnotherSequence = false;
                break;
            }
            if (Character.isWhitespace((char) c)) {
                continue;
            }
            br.reset();
            break;
        }
        return hasAnotherSequence;
    }

    // Reads the next section of an entry: usually one tag with its (possibly multi-line)
    // value; reference (RN..RL) and database (DR/DC) groups come back as multi-element lists.
    public static List<String[]> readSection(BufferedReader br) throws ParseException {
        List<String[]> section = new ArrayList<String[]>();
        String line;
        boolean done = false;
        try {
            while (!done) {
                br.mark(160);
                line = br.readLine();
                if (line == null) { // EOF inside an entry: treat it as the end of the entry
                    section.add(new String[]{END_SEQUENCE_TAG, null});
                    done = true;
                } else if (line.equals(STOCKHOLM_TAG)) {
                    done = true;
                    section.add(new String[]{"STOCKHOLM", null});
                } else if (line.startsWith("#=GF")) {
                    String token = line.substring(5, 7);
                    if (token.equals(DEFINITION_TAG)) {
                        section.add(new String[]{DEFINITION_TAG, line.substring(8)});
                        done = true;
                    } else if (token.charAt(0) == 'R' || token.charAt(0) == 'D') {
                        // reference (R*) and database (D*) tags form multi-line groups
                        br.reset();
                        String currentTag = null;
                        char currentTagStart = '\0';
                        StringBuffer currentVal = null;
                        while (!done) {
                            br.mark(160);
                            line = br.readLine();
                            if (currentTagStart == '\0') currentTagStart = line.charAt(5);
                            if (line == null || !line.startsWith("#=GF " + currentTagStart) ||
                                    (currentTagStart == 'R' && currentTag != null && line.substring(5, 7).equals(REF_NUMBER_TAG)) ||
                                    (currentTagStart == 'D' && currentTag != null && line.substring(5, 7).equals(DATABASE_REFERENCE_TAG))) {
                                br.reset();
                                done = true;
                                // dump the current tag if it exists
                                if (currentTag != null) section.add(new String[]{currentTag, currentVal.toString()});
                            } else {
                                try {
                                    String tag = line.substring(5, 7);
                                    String value = line.substring(8);
                                    if (currentTag == null || !tag.equals(currentTag)) {
                                        if (currentTag != null) section.add(new String[]{currentTag, currentVal.toString()});
                                        currentTag = tag;
                                        currentVal = new StringBuffer();
                                        currentVal.append(value);
                                    } else {
                                        currentVal.append("\n");
                                        currentVal.append(value);
                                    }
                                } catch (Exception e) {
                                    throw new ParseException(e);
                                }
                            }
                        }
                    } else { // values on a single line or spread over multiple lines
                        StringBuffer currentVal = new StringBuffer();
                        currentVal.append(line.substring(8).trim());
                        while (!done) {
                            br.mark(160);
                            line = br.readLine();
                            // check startsWith before taking the substring: the next line may be
                            // shorter than 7 characters (e.g. "//"), and a following #=GS line
                            // might have "SQ" in the right position
                            if (line == null || !line.startsWith("#=GF") || !line.substring(5, 7).equals(token)) {
                                br.reset();
                                done = true;
                                section.add(new String[]{token, currentVal.toString()});
                            } else {
                                currentVal.append("\n");
                                currentVal.append(line.substring(8).trim());
                            }
                        }
                    }
                } else if (line.startsWith("#=GS")) {
                    done = true;
                    section.add(new String[]{"GSMarkUp", line.substring(5).trim()});
                } else if (line.startsWith("#=GR")) {
                    done = true;
                    section.add(new String[]{"GRMarkUp", line.substring(5).trim()});
                } else if (line.startsWith("#=GC")) {
                    done = true;
                    section.add(new String[]{"GCMarkUp", line.substring(5).trim()});
                } else if (line.startsWith(END_SEQUENCE_TAG)) {
                    section.add(new String[]{END_SEQUENCE_TAG, null});
                    done = true;
                } else { // no prefix tag: a plain sequence line
                    done = true;
                    section.add(new String[]{"Sequence", line.trim()});
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return section;
    }
}
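
Pfam.process consumes one Stockholm entry per call and reports whether more input remains, which is the contract the read loop in PfamDataParser depends on. A minimal driver (plain Scala, assuming the Pfam class above is on the classpath; the record is a shortened, illustrative Stockholm entry):

import java.io.{BufferedReader, StringReader}

import cn.piflow.bundle.microorganism.util.Pfam
import org.json.JSONObject

object PfamDriver extends App {
  val record =
    """# STOCKHOLM 1.0
      |#=GF ID   7tm_1
      |#=GF AC   PF00001.21
      |#=GF DE   7 transmembrane receptor (rhodopsin family)
      |#=GF TP   Family
      |//""".stripMargin
  val br = new BufferedReader(new StringReader(record))
  var hasNext = true
  while (hasNext) {
    val doc = new JSONObject()
    hasNext = Pfam.process(br, doc) // false once the reader is exhausted
    println(doc.toString(2))        // identification, accession, definition, type, ...
  }
}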