forked from opensci/piflow
parent fb5557b62c
commit 53cb53f713

@@ -0,0 +1,47 @@
{
  "flow":{
    "name":"test",
    "uuid":"1234",
    "stops":[
      {
        "uuid":"1111",
        "name":"SelectFilesByName",
        "bundle":"cn.piflow.bundle.ftp.SelectFilesByName",
        "properties":{
          "HDFSUrl":"hdfs://10.0.88.70:9000",
          "HDFSPath":"/yqd/weishengwu/PDB/",
          "selectionConditions":".*.ent.gz"
        }
      },{
        "uuid":"2222",
        "name":"UnzipFilesOnHDFS",
        "bundle":"cn.piflow.bundle.http.UnzipFilesOnHDFS",
        "properties":{
          "isCustomize":"false",
          "hdfsUrl":"hdfs://10.0.88.70:9000",
          "filePath":"/yqd/weishengwu/PDB/",
          "savePath":""
        }
      },{
        "uuid":"3333",
        "name":"PDBParser",
        "bundle":"cn.piflow.bundle.microorganism.PDBParser",
        "properties":{
        }
      }
    ],
    "paths":[
      {
        "from":"SelectFilesByName",
        "outport":"",
        "inport":"",
        "to":"UnzipFilesOnHDFS"
      },{
        "from":"UnzipFilesOnHDFS",
        "outport":"",
        "inport":"",
        "to":"PDBParser"
      }
    ]
  }
}
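
The flow above wires three stops in a line: SelectFilesByName lists the *.ent.gz files under /yqd/weishengwu/PDB/, UnzipFilesOnHDFS decompresses them, and PDBParser turns each decompressed file into one JSON document. A minimal sketch of how the stops and paths sections fit together, read with the same org.json library the parsers use (FlowInspect and the flow.json path are illustrative, not piflow API):

import org.json.JSONObject

object FlowInspect {
  def main(args: Array[String]): Unit = {
    val flowJson: String = scala.io.Source.fromFile("flow.json").mkString // assumed local copy of the config above
    val flow = new JSONObject(flowJson).getJSONObject("flow")

    // Each stop is addressed by its "name"; "bundle" names the class that implements it.
    val stops = flow.getJSONArray("stops")
    for (i <- 0 until stops.length()) {
      val stop = stops.getJSONObject(i)
      println(stop.getString("name") + " -> " + stop.getString("bundle"))
    }

    // Paths wire stop names together; empty in/out ports mean the default port.
    val paths = flow.getJSONArray("paths")
    for (i <- 0 until paths.length()) {
      val p = paths.getJSONObject(i)
      println(p.getString("from") + " --> " + p.getString("to"))
    }
  }
}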
Binary file not shown (image, 13 KiB).
Binary file not shown (image, 6.0 KiB).

@@ -0,0 +1,146 @@
package cn.piflow.bundle.microorganism

import java.io._

import cn.piflow.bundle.microorganism.util.PDB
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.ImageUtil
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.json.JSONObject

class PDBParser extends ConfigurableStop {
  override val authorEmail: String = "yangqidong@cnic.cn"
  override val description: String = "Parse PDB type data"
  override val inportList: List[String] = List(PortEnum.DefaultPort.toString)
  override val outportList: List[String] = List(PortEnum.DefaultPort.toString)

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val session = pec.get[SparkSession]()

    val inDf: DataFrame = in.read()
    val configuration: Configuration = new Configuration()
    var pathStr: String = ""
    var hdfsUrl: String = ""
    try {
      // The first column of the incoming DataFrame holds HDFS file paths;
      // rebuild the scheme and authority ("hdfs://host:port/") from the first one.
      pathStr = inDf.take(1)(0).get(0).asInstanceOf[String]
      val pathARR: Array[String] = pathStr.split("\\/")
      for (x <- 0 until 3) {
        hdfsUrl += (pathARR(x) + "/")
      }
    } catch {
      case e: Exception => throw new Exception("Path error")
    }

    configuration.set("fs.defaultFS", hdfsUrl)
    val fs: FileSystem = FileSystem.get(configuration)

    // Temporary file that accumulates one JSON object per parsed PDB file.
    // (The name is a leftover from the Refseq parser this stop was adapted from.)
    val hdfsPathTemporary: String = hdfsUrl + "/Refseq_genomeParser_temporary.json"
    val path: Path = new Path(hdfsPathTemporary)

    if (fs.exists(path)) {
      fs.delete(path, true)
    }

    fs.create(path).close()
    val fdos: FSDataOutputStream = fs.append(path)
    val buff: Array[Byte] = new Array[Byte](1048576)

    var bis: BufferedInputStream = null
    var doc: JSONObject = null
    var pdb: PDB = null
    var jsonStr: String = ""
    var n: Int = 0
    inDf.collect().foreach(row => {
      pathStr = row.get(0).asInstanceOf[String]
      println("start parsing: " + pathStr)

      pdb = new PDB(pathStr, fs)
      doc = pdb.getDoc

      jsonStr = doc.toString
      n += 1

      // Emit the documents as one JSON array: "[" before the first object,
      // "," before every following one, and "]" once after the loop.
      if (n == 1) {
        bis = new BufferedInputStream(new ByteArrayInputStream(("[" + jsonStr).getBytes()))
      } else {
        bis = new BufferedInputStream(new ByteArrayInputStream(("," + jsonStr).getBytes()))
      }
      var count: Int = bis.read(buff)
      while (count != -1) {
        fdos.write(buff, 0, count)
        fdos.flush()
        count = bis.read(buff)
      }
      fdos.flush()

      bis = null
      doc = null
      jsonStr = ""
      pathStr = null
      pdb = null
    })
    bis = new BufferedInputStream(new ByteArrayInputStream("]".getBytes()))

    var count: Int = bis.read(buff)
    while (count != -1) {
      fdos.write(buff, 0, count)
      fdos.flush()
      count = bis.read(buff)
    }
    fdos.flush()
    bis.close()
    fdos.close()

    // Read the temporary JSON array back as a DataFrame and pass it downstream.
    val df: DataFrame = session.read.json(hdfsPathTemporary)
    df.show(20)
    out.write(df)
  }

  override def setProperties(map: Map[String, Any]): Unit = {
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("/microorganism/pdb-logo.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MicroorganismGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {
  }
}
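
The stop above streams its output as a single JSON array without ever holding all documents in memory: a "[" before the first object, a "," before each later one, and a "]" once after the loop. A self-contained sketch of the same pattern over a plain OutputStream (JsonArrayStreaming and writeAsArray are illustrative names, not piflow API):

import java.io.{ByteArrayOutputStream, OutputStream}
import org.json.JSONObject

object JsonArrayStreaming {
  // Write docs out as one JSON array, one object at a time.
  def writeAsArray(docs: Iterator[JSONObject], out: OutputStream): Unit = {
    var n = 0
    docs.foreach { doc =>
      n += 1
      val prefix = if (n == 1) "[" else ","
      out.write((prefix + doc.toString).getBytes("UTF-8"))
    }
    if (n == 0) out.write("[".getBytes("UTF-8")) // keep the array well-formed when empty
    out.write("]".getBytes("UTF-8"))
  }

  def main(args: Array[String]): Unit = {
    val docs = Iterator(new JSONObject().put("id", 1), new JSONObject().put("id", 2))
    val buf = new ByteArrayOutputStream()
    writeAsArray(docs, buf)
    println(buf.toString("UTF-8")) // prints: [{"id":1},{"id":2}]
  }
}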

@@ -0,0 +1,148 @@
package cn.piflow.bundle.microorganism

import java.io.{BufferedInputStream, BufferedReader, ByteArrayInputStream, InputStreamReader}

import cn.piflow.bundle.microorganism.util.Pfam
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.ImageUtil
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.json.JSONObject

class PfamDataParser extends ConfigurableStop {
  override val authorEmail: String = "yangqidong@cnic.cn"
  override val description: String = "Parse Pfam type data"
  override val inportList: List[String] = List(PortEnum.DefaultPort.toString)
  override val outportList: List[String] = List(PortEnum.DefaultPort.toString)

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val session = pec.get[SparkSession]()

    val inDf: DataFrame = in.read()
    val configuration: Configuration = new Configuration()
    var pathStr: String = ""
    var hdfsUrl: String = ""
    try {
      // The first column of the incoming DataFrame holds HDFS file paths;
      // rebuild the scheme and authority ("hdfs://host:port/") from the first one.
      pathStr = inDf.take(1)(0).get(0).asInstanceOf[String]
      val pathARR: Array[String] = pathStr.split("\\/")
      for (x <- 0 until 3) {
        hdfsUrl += (pathARR(x) + "/")
      }
    } catch {
      case e: Exception => throw new Exception("Path error")
    }

    configuration.set("fs.defaultFS", hdfsUrl)
    val fs: FileSystem = FileSystem.get(configuration)

    // Temporary file that accumulates one JSON object per parsed Pfam entry.
    // (The name is a leftover from the Refseq parser this stop was adapted from.)
    val hdfsPathTemporary: String = hdfsUrl + "/Refseq_genomeParser_temporary.json"
    val path: Path = new Path(hdfsPathTemporary)

    if (fs.exists(path)) {
      fs.delete(path, true)
    }

    fs.create(path).close()
    val fdos: FSDataOutputStream = fs.append(path)
    val buff: Array[Byte] = new Array[Byte](1048576)

    var bis: BufferedInputStream = null
    var fdis: FSDataInputStream = null
    var br: BufferedReader = null
    var doc: JSONObject = null
    var hasAnotherSequence: Boolean = true
    var jsonStr: String = ""
    var n: Int = 0
    inDf.collect().foreach(row => {
      pathStr = row.get(0).asInstanceOf[String]
      println("start parsing: " + pathStr)
      fdis = fs.open(new Path(pathStr))
      br = new BufferedReader(new InputStreamReader(fdis))
      hasAnotherSequence = true // reset per file; otherwise only the first file would be parsed

      // Pfam.process consumes one Stockholm entry per call and reports whether
      // more input follows. Note that n is never reset, so it both drives the
      // "[" vs "," prefix and caps the total number of entries at 1000.
      while (hasAnotherSequence && n < 1000) {
        n += 1

        doc = new JSONObject()
        hasAnotherSequence = Pfam.process(br, doc)

        jsonStr = doc.toString

        if (n == 1) {
          bis = new BufferedInputStream(new ByteArrayInputStream(("[" + jsonStr).getBytes()))
        } else {
          bis = new BufferedInputStream(new ByteArrayInputStream(("," + jsonStr).getBytes()))
        }
        var count: Int = bis.read(buff)
        while (count != -1) {
          fdos.write(buff, 0, count)
          fdos.flush()
          count = bis.read(buff)
        }
        fdos.flush()

        bis = null
        doc = null
        jsonStr = ""
      }

      br = null
      fdis = null
      pathStr = null
    })
    bis = new BufferedInputStream(new ByteArrayInputStream("]".getBytes()))

    var count: Int = bis.read(buff)
    while (count != -1) {
      fdos.write(buff, 0, count)
      fdos.flush()
      count = bis.read(buff)
    }
    fdos.flush()
    bis.close()
    fdos.close()

    // Read the temporary JSON array back as a DataFrame and pass it downstream.
    val df: DataFrame = session.read.json(hdfsPathTemporary)
    println(df.count())
    df.show(20)
    df.printSchema()
    out.write(df)
  }

  override def setProperties(map: Map[String, Any]): Unit = {
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("/microorganism/pfam.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.MicroorganismGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {
  }
}
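
Both parser stops finish by handing the temporary file to session.read.json, which infers a schema from the JSON documents and produces the DataFrame that is written downstream. A minimal standalone sketch of that last step (the local master and the /tmp path are assumptions for illustration):

import org.apache.spark.sql.{DataFrame, SparkSession}

object ReadParsedJson {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder()
      .appName("read-parsed-json")
      .master("local[*]") // assumption: run locally; the flow itself runs on a cluster
      .getOrCreate()

    // The parsers write the whole array on one line; Spark's JSON reader
    // expands a top-level array into one row per element.
    val df: DataFrame = session.read.json("/tmp/Refseq_genomeParser_temporary.json")
    df.printSchema()
    df.show(20)

    session.stop()
  }
}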

@@ -0,0 +1,301 @@
package cn.piflow.bundle.microorganism.util;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
import org.biojava.nbio.structure.*;
import org.biojava.nbio.structure.io.PDBFileReader;
import org.json.JSONArray;
import org.json.JSONObject;

import java.io.*;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

public class PDB {

    static final Logger logger = Logger.getLogger(PDB.class);

    static final DateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
    static final DateFormat pdbdateformatter = new SimpleDateFormat("dd-MMM-yy", Locale.US);
    static final String NEWLINE = System.getProperty("line.separator");
    private JSONObject doc;
    private String pdbFilePath;
    private FileSystem fs;

    public PDB(String path, FileSystem f) {
        this.pdbFilePath = path;
        this.doc = new JSONObject();
        this.fs = f;
        parsePDB();
    }

    public JSONObject getDoc() {
        return this.doc;
    }

    // Two passes: BioJava for the header-level fields, then a line-by-line
    // pass for the fixed-column records BioJava does not expose.
    public void parsePDB() {
        parsePDBBioJava();
        parsePDBByLine();
    }

    private void parsePDBByLine() {
        try {
            FSDataInputStream fis = fs.open(new Path(pdbFilePath));
            BufferedReader br = new BufferedReader(new InputStreamReader(fis));
            String line;
            while ((line = br.readLine()) != null) {
                // ignore empty lines
                if (line.equals("") || line.equals(NEWLINE)) {
                    continue;
                }
                // ignore short TER and END lines
                if (line.startsWith("TER") || line.startsWith("END")) {
                    continue;
                }
                if (line.length() < 6) {
                    logger.info("Found line length below 6. Ignoring it, line: >" + line + "<");
                    continue;
                }
                String recordName = line.substring(0, 6).trim();
                if (recordName.equals("HET")) {
                    het_Handler(line);
                } else if (recordName.equals("REVDAT")) {
                    revdat_Handler(line);
                } else if (recordName.equals("SEQRES")) {
                    seqres_Handler(line);
                } else if (recordName.equals("MODRES")) {
                    modres_Handler(line);
                } else if (recordName.equals("HETNAM")) {
                    hetnam_Handler(line);
                } else if (recordName.equals("HELIX")) {
                    helix_Handler(line);
                } else if (recordName.equals("MASTER")) {
                    master_Handler(line);
                } else if (recordName.equals("COMPND")) {
                    continue;
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            logger.error("parsing error in processing: " + pdbFilePath);
            logger.error(e.getMessage());
        }
    }

    // fields BioJava can process
    private void parsePDBBioJava() {
        PDBFileReader pdbreader = new PDBFileReader();
        try {
            FSDataInputStream fis = fs.open(new Path(pdbFilePath));
            Structure struc = pdbreader.getStructure(fis);
            PDBHeader pdbHeader = struc.getPDBHeader();
            doc.put("DepositionDate", formatter.format(pdbHeader.getDepDate()));
            String depositionYear = doc.getString("DepositionDate").substring(0, 4);
            write_to_obj(doc, depositionYear, "DepositionYear");
            doc.put("Classification", pdbHeader.getClassification());
            doc.put("Idcode", pdbHeader.getIdCode());

            doc.put("ReferenceTitle", pdbHeader.getTitle());
            JSONArray compounds = new JSONArray();
            for (Compound c : struc.getCompounds()) {
                JSONObject compound = new JSONObject();
                compound.put("MOLID", c.getMolId());
                compound.put("MoleculeName", c.getMolName());
                String compound_Chain = "";
                for (Chain chain : c.getChains()) {
                    compound_Chain += ("," + chain.getChainID());
                }
                if (!compound_Chain.equals("")) {
                    compound.put("Chain", compound_Chain.substring(1));
                }
                compound.put("Engineered", c.getEngineered());
                compounds.put(compound);
            }
            doc.put("Compounds", compounds);

            if (pdbHeader.getExperimentalTechniques() != null) {
                Iterator<ExperimentalTechnique> experimentalTechniqueIterator = pdbHeader.getExperimentalTechniques().iterator();
                List<String> techniques = new ArrayList<String>();
                while (experimentalTechniqueIterator.hasNext()) {
                    techniques.add(experimentalTechniqueIterator.next().getName());
                }
                doc.put("Techniques", techniques);
            }

            doc.put("Author", pdbHeader.getAuthors());

            JSONArray sites = new JSONArray();
            for (Site site : struc.getSites()) {
                JSONObject siteObject = new JSONObject();
                siteObject.put("SiteIdentifier", site.getSiteID());
                siteObject.put("SiteDescription", site.getDescription());
                sites.put(siteObject);
            }
            doc.put("sites", sites);

            JSONArray dbRefs = new JSONArray();
            for (DBRef dbRef : struc.getDBRefs()) {
                JSONObject dbRefObject = new JSONObject();
                dbRefObject.put("ChainID", dbRef.getChainId());
                dbRefObject.put("SeqBegin", dbRef.getSeqBegin());
                dbRefObject.put("SeqEnd", dbRef.getSeqEnd());
                dbRefObject.put("DbName", dbRef.getDatabase());
                dbRefObject.put("DbAccession", dbRef.getDbAccession());
                dbRefObject.put("DbSeqBegin", dbRef.getDbSeqBegin());
                dbRefObject.put("DbSeqEnd", dbRef.getDbSeqEnd());
                dbRefs.put(dbRefObject);
            }
            doc.put("dbRefs", dbRefs);
        } catch (Exception e) {
            logger.error("parsing error in processing: " + pdbFilePath);
            logger.error(e.getMessage());
            e.printStackTrace();
        }
    }

    private void het_Handler(String line) {
        JSONObject hetObject = new JSONObject();
        String hetID = line.substring(7, 10).trim();
        String chainID = line.substring(12, 13);
        String seqNum = line.substring(14, 17).trim();
        String numHetAtoms = line.substring(21, 25).trim();
        hetObject.put("HetID", hetID);
        hetObject.put("ChainID", chainID);
        hetObject.put("SeqNum", seqNum);
        hetObject.put("NumHetAtoms", numHetAtoms);
        write_to_doc("hets", hetObject);
    }

    private void revdat_Handler(String line) throws ParseException {
        JSONObject revdatObject = new JSONObject();
        String modNumber = line.substring(8, 10).trim();
        write_to_obj(revdatObject, modNumber, "ModificationNumber");
        String modDateStr = line.substring(13, 22).trim();
        if (!modDateStr.equals("")) {
            Date modDate = pdbdateformatter.parse(modDateStr);
            revdatObject.put("ModificationDate", formatter.format(modDate));
        }
        write_to_doc("revdats", revdatObject);
    }

    private void seqres_Handler(String line) {
        JSONObject seqresObject = new JSONObject();
        String chainID = line.substring(11, 12);
        String numRes = line.substring(13, 17).trim();
        String acidSeq = line.substring(18).trim();
        seqresObject.put("chainID", chainID);
        write_to_obj(seqresObject, numRes, "NumRes");
        seqresObject.put("AcidSeq", acidSeq);
        write_to_doc("seqreses", seqresObject);
    }

    private void modres_Handler(String line) {
        JSONObject modresObject = new JSONObject();
        // MODRES example: MODRES 10MH 5NC C  427  DC  5-AZA-CYTIDINE-5'MONOPHOSPHATE
        String resName = line.substring(12, 15).trim();
        String chainID = line.substring(16, 17);
        String seqNum = line.substring(18, 22).trim();
        String stdRes = line.substring(23, 27).trim();
        String modComment = line.substring(28).trim();
        modresObject.put("ChainID", chainID);
        modresObject.put("ResName", resName);
        write_to_obj(modresObject, seqNum, "SeqNumber");
        modresObject.put("StdRes", stdRes);
        modresObject.put("ModificationComment", modComment);
        write_to_doc("modreses", modresObject);
    }

    private void hetnam_Handler(String line) {
        JSONObject hetnamObject = new JSONObject();
        String hetID = line.substring(10, 14).trim();
        String chemicalName = line.substring(15).trim();
        hetnamObject.put("HetID", hetID);
        hetnamObject.put("ChemicalName", chemicalName);
        write_to_doc("hetnams", hetnamObject);
    }

    private void helix_Handler(String line) {
        JSONObject helixObject = new JSONObject();
        String helixID = line.substring(12, 14).trim();
        if (line.trim().length() >= 40) {
            String helixClass = line.substring(39, 40);
            write_to_obj(helixObject, helixClass, "HelixClass");
        }
        if (line.trim().length() >= 76) {
            String helixLength = line.substring(72, 76).trim();
            write_to_obj(helixObject, helixLength, "HelixLength");
        }
        helixObject.put("HelixID", helixID);
        write_to_doc("helixes", helixObject);
    }

    private void master_Handler(String line) {
        JSONObject masterObject = new JSONObject();
        String numRemark = line.substring(11, 15).trim();
        String numHet = line.substring(21, 25).trim();
        String numHelix = line.substring(26, 30).trim();
        String numSheet = line.substring(31, 35).trim();
        String numTurn = line.substring(36, 40).trim();
        String numSite = line.substring(41, 45).trim();
        String numXForm = line.substring(46, 50).trim();
        String numCoord = line.substring(51, 55).trim();
        String numTer = line.substring(56, 60).trim();
        String numConect = line.substring(61, 65).trim();
        String numSeq = line.substring(66, 70).trim();
        write_to_obj(masterObject, numRemark, "NumRemark");
        write_to_obj(masterObject, numHet, "NumHet");
        write_to_obj(masterObject, numHelix, "NumHelix");
        write_to_obj(masterObject, numSheet, "NumSheet");
        write_to_obj(masterObject, numTurn, "NumTurn");
        write_to_obj(masterObject, numSite, "NumSite");
        write_to_obj(masterObject, numXForm, "NumXForm");
        write_to_obj(masterObject, numCoord, "NumCoord");
        write_to_obj(masterObject, numTer, "NumTer");
        write_to_obj(masterObject, numConect, "NumConect");
        write_to_obj(masterObject, numSeq, "NumSeq");
        write_to_doc("masters", masterObject);
    }

    // Parse str as an integer and store it under key; skip the field on failure.
    private static void write_to_obj(JSONObject obj, String str, String key) {
        int val;
        try {
            val = Integer.parseInt(str);
            obj.put(key, val);
        } catch (NumberFormatException fe) {
            logger.error("parse integer error with string: " + str);
        }
    }

    // Append obj to the named array in doc, creating the array on first use.
    private void write_to_doc(String array_name, JSONObject obj) {
        if (doc.optJSONArray(array_name) != null) {
            doc.getJSONArray(array_name).put(obj);
        } else {
            JSONArray arr = new JSONArray();
            arr.put(obj);
            doc.put(array_name, arr);
        }
    }
}
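
Every line handler above relies on the PDB format's fixed-column layout: the record name sits in columns 1-6 and each field at a fixed offset after it, so plain substring plus trim is enough. A sketch of that slicing on a sample HET record, using the same offsets as het_Handler (the sample line is illustrative):

object HetLineDemo {
  def main(args: Array[String]): Unit = {
    // Illustrative HET record; the column positions match het_Handler above.
    val line = "HET    NAG  A 131      14"
    val recordName  = line.substring(0, 6).trim   // "HET"
    val hetID       = line.substring(7, 10).trim  // "NAG"
    val chainID     = line.substring(12, 13)      // "A"
    val seqNum      = line.substring(14, 17).trim // "131"
    val numHetAtoms = line.substring(21, 25).trim // "14"
    println(s"$recordName $hetID chain=$chainID seq=$seqNum atoms=$numHetAtoms")
  }
}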

@@ -0,0 +1,348 @@
package cn.piflow.bundle.microorganism.util;

import org.biojava.bio.seq.io.ParseException;
import org.json.JSONArray;
import org.json.JSONObject;

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Pfam {

    // Compulsory fields
    protected static final String IDENTIFICATION_TAG = "ID";
    protected static final String ACCESSION_TAG = "AC";
    protected static final String DEFINITION_TAG = "DE";
    protected static final String AUTHOR_TAG = "AU";
    protected static final String SEED_SOURCE_TAG = "SE";
    protected static final String STRUCTURE_SOURCE_TAG = "SS";
    protected static final String BUILD_METHOD_TAG = "BM";
    protected static final String SEARCH_METHOD_TAG = "SM";
    protected static final String GATHERING_THRESHOLD_TAG = "GA";
    protected static final String TRUSTED_CUTOFF_TAG = "TC";
    protected static final String NOISE_CUTOFF_TAG = "NC";
    protected static final String TYPE_TAG = "TP";
    protected static final String SEQUENCE_TAG = "SQ";
    // Optional fields
    protected static final String DATABASE_COMMENT_TAG = "DC";
    protected static final String DATABASE_REFERENCE_TAG = "DR";
    protected static final String REF_COMMENT_TAG = "RC";
    protected static final String REF_NUMBER_TAG = "RN";
    protected static final String REF_MEDLINE_TAG = "RM";
    protected static final String REF_TITLE_TAG = "RT";
    protected static final String REF_AUTHOR_TAG = "RA";
    protected static final String REF_LOCATION_TAG = "RL";
    protected static final String PRE_IDENTIFIER_TAG = "PI";
    protected static final String KEYWORDS_TAG = "KW";
    protected static final String COMMENT_TAG = "CC";
    protected static final String PFAM_ACCESSION_TAG = "NE"; // indicates a nested domain
    protected static final String LOCATION_TAG = "NL"; // location of nested domain: sequence ID, start and end of insert
    protected static final String WIKI_LINK_TAG = "WK";
    protected static final String CLAN_TAG = "CL"; // clan accession
    protected static final String MEMBERSHIP_TAG = "MB";

    protected static final String END_SEQUENCE_TAG = "//";

    protected static final String STOCKHOLM_TAG = "# STOCKHOLM 1.0";

    protected static final Pattern gsseqp = Pattern.compile("^(.+)/(\\d+)-(\\d+)\\s+([A-Z]{2})\\s+(.+)$");
    protected static final Pattern seqp = Pattern.compile("^(\\S+)/(\\d+-\\d+)\\s+(.+)$");
    protected static final Pattern gcp = Pattern.compile("^(\\S+)\\s+(.+)$");

    // Parse one Stockholm entry from br into doc; return true if more input follows.
    public static boolean process(BufferedReader br, JSONObject doc) throws IOException {
        // Local rather than static: a shared flag would stay false after the
        // first file hits EOF and stop all later files from being parsed.
        boolean hasAnotherSequence = true;
        String sectionKey = null;
        String sectionVal = null;
        JSONArray dbRefs = new JSONArray();
        JSONArray gcLines = new JSONArray();
        JSONArray rankedRefs = new JSONArray();
        Map<String, JSONObject> sequences = new HashMap<String, JSONObject>();
        try {
            List<String[]> section = null;
            do {
                section = readSection(br);
                sectionKey = section.get(0)[0];
                sectionVal = section.get(0)[1];
                if (section.size() == 1 && sectionVal != null) {
                    sectionVal = sectionVal.trim();
                    if (sectionVal.contains("\n")) {
                        sectionVal = sectionVal.replaceAll("\\n", "");
                    }
                    if (sectionVal.endsWith(";")) sectionVal = sectionVal.substring(0, sectionVal.length() - 1);

                    if (sectionKey.equals(IDENTIFICATION_TAG)) {
                        doc.put("identification", sectionVal);
                    } else if (sectionKey.equals(ACCESSION_TAG)) {
                        doc.put("accession", sectionVal);
                    } else if (sectionKey.equals(DEFINITION_TAG)) {
                        doc.put("definition", sectionVal);
                    } else if (sectionKey.equals(AUTHOR_TAG)) {
                        doc.put("author", sectionVal);
                    } else if (sectionKey.equals(GATHERING_THRESHOLD_TAG)) {
                        doc.put("gathering_threshold", sectionVal);
                    } else if (sectionKey.equals(TRUSTED_CUTOFF_TAG)) {
                        doc.put("trusted_cutoff", sectionVal);
                    } else if (sectionKey.equals(NOISE_CUTOFF_TAG)) {
                        doc.put("noise_cutoff", sectionVal);
                    } else if (sectionKey.equals(TYPE_TAG)) {
                        doc.put("type", sectionVal);
                    } else if (sectionKey.equals(SEQUENCE_TAG)) {
                        doc.put("sequence_length", Integer.parseInt(sectionVal));
                    } else if (sectionKey.equals(SEED_SOURCE_TAG)) {
                        doc.put("seed_source", sectionVal);
                    } else if (sectionKey.equals(STRUCTURE_SOURCE_TAG)) {
                        doc.put("struc_source", sectionVal);
                    } else if (sectionKey.equals(BUILD_METHOD_TAG)) {
                        doc.put("build_method", sectionVal);
                    } else if (sectionKey.equals(SEARCH_METHOD_TAG)) {
                        doc.put("search_method", sectionVal);
                    } else if (sectionKey.equals(PRE_IDENTIFIER_TAG)) {
                        doc.put("pre_identifier", sectionVal);
                    } else if (sectionKey.equals(KEYWORDS_TAG)) {
                        doc.put("keywords", sectionVal);
                    } else if (sectionKey.equals(COMMENT_TAG)) {
                        doc.put("comment", sectionVal);
                    } else if (sectionKey.equals(PFAM_ACCESSION_TAG)) {
                        doc.put("pfam_accession", sectionVal);
                    } else if (sectionKey.equals(LOCATION_TAG)) {
                        doc.put("location", sectionVal);
                    } else if (sectionKey.equals(WIKI_LINK_TAG)) {
                        doc.put("wiki_link", sectionVal);
                    } else if (sectionKey.equals(CLAN_TAG)) {
                        doc.put("clan", sectionVal);
                    } else if (sectionKey.equals(MEMBERSHIP_TAG)) {
                        doc.put("membership", sectionVal);
                    } else if (sectionKey.equals(DATABASE_REFERENCE_TAG)) {
                        JSONObject dbRef = new JSONObject();
                        String[] parts = sectionVal.split(";");
                        dbRef.put("databaseName", parts[0].trim());
                        dbRef.put("databaseID", parts[1].trim());
                        dbRefs.put(dbRef);
                    } else if (sectionKey.equals("STOCKHOLM")) {
                        // do nothing
                    } else if (sectionKey.equals("GSMarkUp")) {
                        JSONObject seq = new JSONObject();
                        Matcher m = gsseqp.matcher(sectionVal);
                        if (m.matches()) {
                            String seq_name = m.group(1);
                            String seq_start = m.group(2);
                            String seq_end = m.group(3);
                            String feature_tag = m.group(4);
                            String feature_value = m.group(5);
                            seq.put("seq_name", seq_name);
                            seq.put("seq_start", seq_start);
                            seq.put("seq_end", seq_end);
                            seq.put(feature_tag, feature_value);
                            sequences.put(seq_name, seq);
                        }
                    } else if (sectionKey.equals("Sequence")) { // sequence line with no previous tag
                        Matcher m = seqp.matcher(sectionVal);
                        if (m.matches()) {
                            String seq_name = m.group(1);
                            String sequence = m.group(3);
                            if (sequences.containsKey(seq_name)) {
                                sequences.get(seq_name).put("sequence", sequence);
                            }
                        }
                    } else if (sectionKey.equals("GRMarkUp")) { // more info just below the sequence
                        Matcher m = gsseqp.matcher(sectionVal);
                        if (m.matches()) {
                            String seq_name = m.group(1);
                            String feature_tag = m.group(4);
                            String feature_val = m.group(5);
                            if (sequences.containsKey(seq_name)) {
                                sequences.get(seq_name).put(feature_tag, feature_val);
                            }
                        }
                    } else if (sectionKey.equals("GCMarkUp")) {
                        Matcher m = gcp.matcher(sectionVal);
                        if (m.matches()) {
                            JSONObject gcObj = new JSONObject();
                            String feature_tag = m.group(1);
                            String feature_val = m.group(2);
                            gcObj.put(feature_tag, feature_val);
                            gcLines.put(gcObj);
                        }
                    } else {
                        String name = sectionKey.toLowerCase();
                        doc.put(name, sectionVal);
                    }
                } else if (section.size() > 1) {
                    sectionKey = section.get(0)[0];
                    if (sectionKey.equals(REF_NUMBER_TAG)) { // this section is a reference
                        JSONObject refObj = new JSONObject();
                        String refRank = sectionVal.trim();
                        refRank = refRank.substring(1, refRank.length() - 1);
                        int ref_rank = Integer.parseInt(refRank);
                        String medlineID = null;
                        String title = null;
                        String authors = null;
                        String location = null;
                        String comment = null;
                        for (int i = 1; i < section.size(); i++) {
                            String key = section.get(i)[0];
                            String val = section.get(i)[1].trim();
                            if (val.contains("\n")) {
                                val = val.replaceAll("\\n", "");
                            }
                            if (val.endsWith(";")) val = val.substring(0, val.length() - 1);
                            if (key.equals(REF_MEDLINE_TAG)) { medlineID = val; }
                            if (key.equals(REF_TITLE_TAG)) { title = val; }
                            if (key.equals(REF_AUTHOR_TAG)) { authors = val; }
                            if (key.equals(REF_LOCATION_TAG)) { location = val; }
                            if (key.equals(REF_COMMENT_TAG)) { comment = val; }
                        }
                        refObj.put("rank", ref_rank);
                        if (medlineID != null) { // guard: the RM line is optional for a reference
                            refObj.put("medlineID", Integer.parseInt(medlineID));
                        }
                        refObj.put("title", title);
                        refObj.put("authors", authors);
                        refObj.put("location", location);
                        refObj.put("comment", comment);
                        rankedRefs.put(refObj);
                    } else if (sectionKey.equals(DATABASE_REFERENCE_TAG)) {
                        JSONObject dbRef = new JSONObject();
                        String[] parts = sectionVal.split(";");
                        dbRef.put("databaseName", parts[0].trim());
                        dbRef.put("databaseID", parts[1].trim());

                        StringBuffer comment = new StringBuffer();
                        for (int i = 1; i < section.size(); i++) {
                            String key = section.get(i)[0];
                            String val = section.get(i)[1].trim();
                            if (key.equals(DATABASE_COMMENT_TAG)) {
                                comment.append(val);
                            }
                        }
                        dbRef.put("comment", comment.toString());
                        dbRefs.put(dbRef);
                    }
                }
            } while (!sectionKey.equals(END_SEQUENCE_TAG));
            doc.put("dbRefs", dbRefs);
            doc.put("gcLines", gcLines);
            JSONArray sequencesArr = new JSONArray();
            for (Map.Entry<String, JSONObject> entry : sequences.entrySet()) {
                sequencesArr.put(entry.getValue());
            }
            // TODO: 2016/6/24 solve the out-of-memory error when the sequences are too large; we keep two copies right now, and putting them into doc makes a third.
            doc.put("sequences", sequencesArr);
            doc.put("rankedRefs", rankedRefs);
        } catch (ParseException e) {
            e.printStackTrace();
        }

        // Peek past trailing whitespace to report whether another entry follows.
        while (true) {
            br.mark(1);
            int c = br.read();
            if (c == -1) {
                hasAnotherSequence = false;
                break;
            }
            if (Character.isWhitespace((char) c)) {
                continue;
            }
            br.reset();
            break;
        }
        return hasAnotherSequence;
    }

    // Read one tagged section: a one-element list for single-value tags, or a
    // multi-element list for grouped tags such as references (RN/RM/RT/RA/RL).
    public static List<String[]> readSection(BufferedReader br) throws ParseException {
        List<String[]> section = new ArrayList<String[]>();
        String line;
        boolean done = false;

        try {
            while (!done) {
                br.mark(160);
                line = br.readLine();

                if (line.equals(STOCKHOLM_TAG)) {
                    done = true;
                    section.add(new String[]{"STOCKHOLM", null});
                } else if (line.startsWith("#=GF")) {
                    String token = line.substring(5, 7);
                    if (token.equals(DEFINITION_TAG)) {
                        section.add(new String[]{DEFINITION_TAG, line.substring(8)});
                        done = true;
                    } else if (token.charAt(0) == 'R' || token.charAt(0) == 'D') {
                        br.reset();
                        String currentTag = null;
                        char currentTagStart = '\0';
                        StringBuffer currentVal = null;

                        while (!done) {
                            br.mark(160);
                            line = br.readLine();

                            if (currentTagStart == '\0') currentTagStart = line.charAt(5);
                            if (!line.startsWith("#=GF " + currentTagStart) ||
                                    (currentTagStart == 'R' && currentTag != null && line.substring(5, 7).equals(REF_NUMBER_TAG)) ||
                                    (currentTagStart == 'D' && currentTag != null && line.substring(5, 7).equals(DATABASE_REFERENCE_TAG))) {
                                br.reset();
                                done = true;
                                // dump the current tag if it exists
                                if (currentTag != null) section.add(new String[]{currentTag, currentVal.toString()});
                            } else {
                                try {
                                    String tag = line.substring(5, 7);
                                    String value = line.substring(8);
                                    if (currentTag == null || !tag.equals(currentTag)) {
                                        if (currentTag != null) section.add(new String[]{currentTag, currentVal.toString()});
                                        currentTag = tag;
                                        currentVal = new StringBuffer();
                                        currentVal.append(value);
                                    } else {
                                        currentVal.append("\n");
                                        currentVal.append(value);
                                    }
                                } catch (Exception e) {
                                    throw new ParseException(e);
                                }
                            }
                        }
                    } else { // values on a single line or spread over multiple lines
                        StringBuffer currentVal = new StringBuffer();
                        currentVal.append(line.substring(8).trim());
                        while (!done) {
                            br.mark(160);
                            line = br.readLine();
                            if (!line.substring(5, 7).equals(token) || !line.startsWith("#=GF")) { // the following #=GS line might have "SQ" in the right position
                                br.reset();
                                done = true;
                                section.add(new String[]{token, currentVal.toString()});
                            } else {
                                currentVal.append("\n");
                                currentVal.append(line.substring(8).trim());
                            }
                        }
                    }
                } else if (line.startsWith("#=GS")) {
                    done = true;
                    section.add(new String[]{"GSMarkUp", line.substring(5).trim()});
                } else if (line.startsWith("#=GR")) {
                    done = true;
                    section.add(new String[]{"GRMarkUp", line.substring(5).trim()});
                } else if (line.startsWith("#=GC")) {
                    done = true;
                    section.add(new String[]{"GCMarkUp", line.substring(5).trim()});
                } else if (line.startsWith(END_SEQUENCE_TAG)) {
                    section.add(new String[]{END_SEQUENCE_TAG, null});
                    done = true;
                } else { // no prefix tag
                    done = true;
                    section.add(new String[]{"Sequence", line.trim()});
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return section;
    }
}
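
Pfam.process consumes exactly one Stockholm entry per call and returns whether more input follows, which is the contract PfamDataParser's loop depends on. A sketch driving it over a tiny hand-written entry (the entry's content is illustrative; only tags the class recognizes are used):

import java.io.{BufferedReader, StringReader}
import org.json.JSONObject
import cn.piflow.bundle.microorganism.util.Pfam

object PfamDemo {
  def main(args: Array[String]): Unit = {
    // One minimal Stockholm entry; "//" terminates it.
    val entry =
      """# STOCKHOLM 1.0
        |#=GF ID   7tm_1
        |#=GF AC   PF00001.21
        |#=GF DE   7 transmembrane receptor (rhodopsin family)
        |//
        |""".stripMargin
    val br = new BufferedReader(new StringReader(entry))

    val doc = new JSONObject()
    val hasMore = Pfam.process(br, doc) // false here: nothing follows this entry
    println(doc.toString)               // identification, accession, definition, ...
    println("another entry follows: " + hasMore)
  }
}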