From 2d1685af1af9f8139f4a18467b9b88f24830eed9 Mon Sep 17 00:00:00 2001 From: yanfqidong0604 Date: Fri, 21 Dec 2018 12:20:58 +0800 Subject: [PATCH 1/3] Analysis of microbial data EMBL, Refseq_genome, Refseq_protein and related components. FTP server download, HDFS file screening, HDFS file decompression optimization. QiDong Yang --- .../resources/microorganism/EMBL_Logo.svg | 76 + .../main/resources/microorganism/down.json | 31 + .../resources/microorganism/embl_parser.json | 67 + .../main/resources/microorganism/refseq.png | Bin 0 -> 1561 bytes .../microorganism/refseq_genome.json | 67 + .../resources/microorganism/select_unzip.json | 37 + .../piflow/bundle/ftp/LoadFromFtpToHDFS.scala | 141 ++ .../piflow/bundle/ftp/SelectFilesByName.scala | 109 ++ .../piflow/bundle/http/UnzipFilesOnHDFS.scala | 219 ++- .../bundle/microorganism/EmblParser.scala | 168 +++ .../bundle/microorganism/RefseqParser.scala | 168 +++ .../microorganism/util/CustomEMBLFormat.java | 1151 +++++++++++++++ .../util/CustomEnsemblFormat.java | 1133 +++++++++++++++ .../microorganism/util/CustomIOTools.java | 20 +- .../util/CustomUniProtFormat.java | 1291 +++++++++++++++++ .../bundle/microorganism/util/ProcessNew.java | 571 ++++++++ .../scala/cn/piflow/bundle/ftp/emblTest.scala | 87 ++ 17 files changed, 5198 insertions(+), 138 deletions(-) create mode 100644 piflow-bundle/src/main/resources/microorganism/EMBL_Logo.svg create mode 100644 piflow-bundle/src/main/resources/microorganism/down.json create mode 100644 piflow-bundle/src/main/resources/microorganism/embl_parser.json create mode 100644 piflow-bundle/src/main/resources/microorganism/refseq.png create mode 100644 piflow-bundle/src/main/resources/microorganism/refseq_genome.json create mode 100644 piflow-bundle/src/main/resources/microorganism/select_unzip.json create mode 100644 piflow-bundle/src/main/scala/cn/piflow/bundle/ftp/LoadFromFtpToHDFS.scala create mode 100644 piflow-bundle/src/main/scala/cn/piflow/bundle/ftp/SelectFilesByName.scala 
create mode 100644 piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/EmblParser.scala create mode 100644 piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/RefseqParser.scala create mode 100644 piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomEMBLFormat.java create mode 100644 piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomEnsemblFormat.java create mode 100644 piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomUniProtFormat.java create mode 100644 piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/ProcessNew.java create mode 100644 piflow-bundle/src/test/scala/cn/piflow/bundle/ftp/emblTest.scala diff --git a/piflow-bundle/src/main/resources/microorganism/EMBL_Logo.svg b/piflow-bundle/src/main/resources/microorganism/EMBL_Logo.svg new file mode 100644 index 0000000..20959c9 --- /dev/null +++ b/piflow-bundle/src/main/resources/microorganism/EMBL_Logo.svg @@ -0,0 +1,76 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/piflow-bundle/src/main/resources/microorganism/down.json b/piflow-bundle/src/main/resources/microorganism/down.json new file mode 100644 index 0000000..8c60db1 --- /dev/null +++ b/piflow-bundle/src/main/resources/microorganism/down.json @@ -0,0 +1,31 @@ +{ + "flow":{ + "name":"test", + "uuid":"1234", + "stops":[ + { + "uuid":"1111", + "name":"LoadFromFtpToHDFS", + "bundle":"cn.piflow.bundle.ftp.LoadFromFtpToHDFS", + "properties":{ + "url_str":"ftp.ebi.ac.uk", + "port":"", + "username":"", + "password":"", + "ftpFile":"/pub/databases/ena/sequence/release/con/rel_con_env_07_r138.dat.gz", + "HDFSUrl":"hdfs://10.0.88.70:9000", + "HDFSPath":"/yqd/weishengwu/embl/", + "isFile":"true" + } + } + ], + "paths":[ + { + "from":"", + "outport":"", + "inport":"", + "to":"" + } + ] + } +} \ No newline at end of file diff --git 
a/piflow-bundle/src/main/resources/microorganism/embl_parser.json b/piflow-bundle/src/main/resources/microorganism/embl_parser.json new file mode 100644 index 0000000..dc40445 --- /dev/null +++ b/piflow-bundle/src/main/resources/microorganism/embl_parser.json @@ -0,0 +1,67 @@ +{ + "flow":{ + "name":"test", + "uuid":"1234", + "stops":[ + + { + "uuid":"1111", + "name":"SelectFilesByName", + "bundle":"cn.piflow.bundle.ftp.SelectFilesByName", + "properties":{ + "HDFSUrl":"hdfs://10.0.88.70:9000", + "HDFSPath":"/yqd/weishengwu/embl", + "selectionConditions":".*con_pro_02_r138.dat.gz,.*con_vrl_01_r138.dat.gz,.*pat_phg_01_r138.dat.gz" + } + },{ + "uuid":"2222", + "name":"UnzipFilesOnHDFS_1", + "bundle":"cn.piflow.bundle.http.UnzipFilesOnHDFS_1", + "properties":{ + "isCustomize":"false", + "filePath":"", + "fileType":"gz", + "unzipPath":"" + + } + }, + { + "uuid":"3333", + "name":"EmblParser", + "bundle":"cn.piflow.bundle.microorganism.EmblParser", + "properties":{ + } + },{ + "uuid":"4444", + "name":"PutEs", + "bundle":"cn.piflow.bundle.es.PutEs", + "properties":{ + "es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72", + "port": "9200", + "es_index": "embl", + "es_type": "embl" + } + } + ], + "paths":[ + { + "from":"SelectFilesByName", + "outport":"", + "inport":"", + "to":"UnzipFilesOnHDFS_1" + }, + { + "from":"UnzipFilesOnHDFS_1", + "outport":"", + "inport":"", + "to":"EmblParser" + }, + { + "from":"EmblParser", + "outport":"", + "inport":"", + "to":"PutEs" + } + ] + } +} \ No newline at end of file diff --git a/piflow-bundle/src/main/resources/microorganism/refseq.png b/piflow-bundle/src/main/resources/microorganism/refseq.png new file mode 100644 index 0000000000000000000000000000000000000000..5448ab907620939434f07a5da5a35f4304f40297 GIT binary patch literal 1561 zcmV+!2Il#RP)Px#1ZP1_K>z@;j|==^1poj532;bRa{vGi!vFvd!vV){sAK>D1+Ga%K~#8N?OGwz zt2z+nUm%f{l}IEKi9{li`~)Nt$;>m%Q_NFHBobMPL?V&rZGb>%1GL}$dT&2&&diFE zG;H={H@n+vP&fq?ktcy7@+441o&<`>lRy!95-1{10!8FWa4I6ZTaL)HaM3krng+uT 
zP&Cuv$?-CgRWusuSae)02Y#HPYWwNi??aP&FmxAL)1!{fMGwCKe|y&yNPGvTvFK@- z792lDUNZ?g4RnIrM_zDDBE z_voEwn1fz7f*shY9CF-oe~w)Fr|}jd`=&xV)N!o4;v=aJJdwPH)qyOmJ({CK?v3x! z!}7uK$3zmyps{^V<|KxVAieYW6*?3?`|cH?>?F+|Z(}ZF8G5qMKr-naJnBsoPzH-m zA3~%|E{Jy1M2;o=1{?F2d(OvwbbgA+0Ud66Bs3}R#BdNx4qRV`!-3o`ohpRW8do7a zF5RU&f<*{d#Ma)Q#kYGT@3IWf%@~J~|ELaRm^b`b^gmA|M~8ZoP#x~7X~4X38ry6X zw8)%kj+>93-4*cpntREUK;*CXA>I&Lmre>3lPte?*2Jvx8M)u^>qK&NXo?u_oLF)3 zkk5{}c-vFLNvp#7nJ#ZPjreRL8R$-$?gvdXNy%+0zfQ!6U;X|Na#OKytSzm?R%kK4 zU}>>--o5)ipGd;o8Y3}je1pps5nFf34t9xTMCjpuM`yq~@t8>yi46Z%A_;T_{@^O^C^kwDg$pbBHAH8j*=EGloF+bwA5(GY?*|revSU9PTjo zL|2I)sJl?y75?n0-jW49g4K-<*I)S6XD_sW9+7*ewMQ*vv9f-rShCp3ZF+i#CWC9d zT1N^x>Netf%c7KNvwsIj6SD>x?+od|#y-bP-4r7VJ491U&Loy(Z|>=kjJp3RBH2Fx zia_XLQvH=wXy%{Z??9(Dyz;rM-4-CL*X^-}HZ_u|!y4LIe6&YfCGcz_@%{vK6=I>6 z`>2C>*q9h_;zC%PZRLg{vZ1r~Te)!>#mN}FNoIhHtZ7cz)nmRB8Ijk4AlA5!eFv6x z5%d_F@t0+C;7hKNu^{wee=%2ipGprWMdFz4EUBJ#IG_}NP)&vctipY~d5qT24 zMC3pJ{-@+aMS0{&polyP6p<%^BJw0qM4kkS$df=3c@iihPXXY6JtvUg5V=`^00000 LNkvXXu0mjfPAK9j literal 0 HcmV?d00001 diff --git a/piflow-bundle/src/main/resources/microorganism/refseq_genome.json b/piflow-bundle/src/main/resources/microorganism/refseq_genome.json new file mode 100644 index 0000000..98f997a --- /dev/null +++ b/piflow-bundle/src/main/resources/microorganism/refseq_genome.json @@ -0,0 +1,67 @@ +{ + "flow":{ + "name":"test", + "uuid":"1234", + "stops":[ + + { + "uuid":"1111", + "name":"SelectFilesByName", + "bundle":"cn.piflow.bundle.ftp.SelectFilesByName", + "properties":{ + "HDFSUrl":"hdfs://10.0.88.70:9000", + "HDFSPath":"/yqd/weishengwu/refseq/", + "selectionConditions":".*genomic.gbff.gz" + } + },{ + "uuid":"2222", + "name":"UnzipFilesOnHDFS_1", + "bundle":"cn.piflow.bundle.http.UnzipFilesOnHDFS_1", + "properties":{ + "isCustomize":"false", + "filePath":"", + "fileType":"gz", + "unzipPath":"" + + } + }, + { + "uuid":"3333", + "name":"Refseq_genomeParser", + "bundle":"cn.piflow.bundle.microorganism.Refseq_genomeParser", + "properties":{ + } + 
},{ + "uuid":"4444", + "name":"PutEs", + "bundle":"cn.piflow.bundle.es.PutEs", + "properties":{ + "es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72", + "port": "9200", + "es_index": "genome", + "es_type": "archaea" + } + } + ], + "paths":[ + { + "from":"SelectFilesByName", + "outport":"", + "inport":"", + "to":"UnzipFilesOnHDFS_1" + }, + { + "from":"UnzipFilesOnHDFS_1", + "outport":"", + "inport":"", + "to":"Refseq_genomeParser" + }, + { + "from":"Refseq_genomeParser", + "outport":"", + "inport":"", + "to":"PutEs" + } + ] + } +} \ No newline at end of file diff --git a/piflow-bundle/src/main/resources/microorganism/select_unzip.json b/piflow-bundle/src/main/resources/microorganism/select_unzip.json new file mode 100644 index 0000000..29c65d2 --- /dev/null +++ b/piflow-bundle/src/main/resources/microorganism/select_unzip.json @@ -0,0 +1,37 @@ +{ + "flow":{ + "name":"test", + "uuid":"1234", + "stops":[ + { + "uuid":"0000", + "name":"SelectFilesByName", + "bundle":"cn.piflow.bundle.ftp.SelectFilesByName", + "properties":{ + "HDFSUrl":"hdfs://10.0.88.70:9000", + "HDFSPath":"/yqd/", + "selectionConditions":".*genomic.gbff.gz" + } + },{ + "uuid":"1111", + "name":"UnzipFilesOnHDFS_1", + "bundle":"cn.piflow.bundle.http.UnzipFilesOnHDFS_1", + "properties":{ + "isCustomize":"true", + "filePath":"hdfs://10.0.88.70:9000/yqd/archaea.1.genomic.gbff.gz", + "fileType":"gz", + "unzipPath":"hdfs://10.0.88.70:9000/yqd/weishengwu/" + + } + } + ], + "paths":[ + { + "from":"SelectFilesByName", + "outport":"", + "inport":"", + "to":"UnzipFilesOnHDFS_1" + } + ] + } +} \ No newline at end of file diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ftp/LoadFromFtpToHDFS.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ftp/LoadFromFtpToHDFS.scala new file mode 100644 index 0000000..211ba74 --- /dev/null +++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ftp/LoadFromFtpToHDFS.scala @@ -0,0 +1,141 @@ +package cn.piflow.bundle.ftp + +import cn.piflow.conf.bean.PropertyDescriptor 
+import cn.piflow.conf.util.{ImageUtil, MapUtil} +import cn.piflow.conf.{ConfigurableStop, PortEnum} +import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} +import org.apache.commons.net.ftp.{FTP, FTPClient, FTPClientConfig, FTPFile} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path} + +class LoadFromFtpToHDFS extends ConfigurableStop { + override val authorEmail: String = "yangqidong@cnic.cn" + override val description: String = "Load file from ftp server save on HDFS" + override val inportList: List[String] = List(PortEnum.NonePort.toString) + override val outportList: List[String] = List(PortEnum.NonePort.toString) + + var url_str:String =_ + var port:String=_ + var username:String=_ + var password:String=_ + var ftpFile:String=_ + var HDFSUrl:String=_ + var HDFSPath:String=_ + var isFile:String=_ + + var fs: FileSystem=null + var con: FTPClientConfig =null + + def downFile(ftp: FTPClient,ftpFilePath:String,HDFSSavePath:String): Unit = { + + val changeFlag: Boolean = ftp.changeWorkingDirectory(ftpFilePath) + val files: Array[FTPFile] = ftp.listFiles() + for(x <- files ) { + if (x.isFile) { + println("down start ^^^ "+x.getName) + val hdfsPath: Path = new Path(HDFSSavePath + x.getName) + if(! 
fs.exists(hdfsPath)){ + var fdos: FSDataOutputStream = fs.create(hdfsPath) + ftp.retrieveFile(new String(x.getName.getBytes("GBK"),"ISO-8859-1"), fdos) + fdos.close() + } + } else { + downFile(ftp,ftpFilePath+x.getName+"/",HDFSSavePath+x.getName+"/") + } + } + + } + + + override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { + + val configuration: Configuration = new Configuration() + configuration.set("fs.defaultFS", HDFSUrl) + fs = FileSystem.get(configuration) + + val ftp:FTPClient = openFtpClient() + + if(isFile.equals("true")){ + val pathArr: Array[String] = ftpFile.split("/") + var dirPath:String="" + for(x <- (0 until pathArr.length-1)){ + dirPath += (pathArr(x)+"/") + } + ftp.changeWorkingDirectory(dirPath) + + var fdos: FSDataOutputStream = fs.create(new Path(HDFSPath+pathArr.last)) + ftp.retrieveFile(new String(pathArr.last.getBytes("GBK"),"ISO-8859-1"), fdos) + fdos.flush() + fdos.close() + }else{ + downFile(ftp,ftpFile,HDFSPath) + } + } + + def openFtpClient(): FTPClient = { + val ftp = new FTPClient + if(port.length > 0 ){ + ftp.connect(url_str,port.toInt) + }else{ + ftp.connect(url_str) + } + if(username.length > 0 && password.length > 0){ + ftp.login(username,password) + }else{ + ftp.login("anonymous", "121@hotmail.com") + } + ftp.setControlEncoding("GBK") + con = new FTPClientConfig(FTPClientConfig.SYST_NT) + con.setServerLanguageCode("zh") + ftp.setFileType(FTP.BINARY_FILE_TYPE) + ftp + } + + + override def setProperties(map: Map[String, Any]): Unit = { + url_str=MapUtil.get(map,key="url_str").asInstanceOf[String] + port=MapUtil.get(map,key="port").asInstanceOf[String] + username=MapUtil.get(map,key="username").asInstanceOf[String] + password=MapUtil.get(map,key="password").asInstanceOf[String] + ftpFile=MapUtil.get(map,key="ftpFile").asInstanceOf[String] + HDFSUrl=MapUtil.get(map,key="HDFSUrl").asInstanceOf[String] + HDFSPath=MapUtil.get(map,key="HDFSPath").asInstanceOf[String] + 
isFile=MapUtil.get(map,key="isFile").asInstanceOf[String] + } + + + override def getPropertyDescriptor(): List[PropertyDescriptor] = { + var descriptor : List[PropertyDescriptor] = List() + val url_str = new PropertyDescriptor().name("url_str").displayName("URL").defaultValue("IP of FTP server, such as 128.136.0.1 or ftp.ei.addfc.gak").required(true) + val port = new PropertyDescriptor().name("port").displayName("PORT").defaultValue("Port of FTP server").required(false) + val username = new PropertyDescriptor().name("username").displayName("USER_NAME").defaultValue("").required(false) + val password = new PropertyDescriptor().name("password").displayName("PASSWORD").defaultValue("").required(false) + val ftpFile = new PropertyDescriptor().name("ftpFile").displayName("FTP_File").defaultValue("The path of the file to the FTP server, such as /test/Ab/ or /test/Ab/test.txt").required(true) + val HDFSUrl = new PropertyDescriptor().name("HDFSUrl").displayName("HDFSUrl").defaultValue("The URL of the HDFS file system, such as hdfs://10.0.88.70:9000").required(true) + val HDFSPath = new PropertyDescriptor().name("HDFSPath").displayName("HDFSPath").defaultValue("The save path of the HDFS file system, such as /test/Ab/").required(true) + val isFile = new PropertyDescriptor().name("isFile").displayName("isFile").defaultValue("Whether the path is a file or not, if true is filled in, only a single file specified by the path is downloaded. 
If false is filled in, all files under the folder are downloaded recursively.").required(true) + descriptor = isFile :: descriptor + descriptor = url_str :: descriptor + descriptor = port :: descriptor + descriptor = username :: descriptor + descriptor = password :: descriptor + descriptor = ftpFile :: descriptor + descriptor = HDFSUrl :: descriptor + descriptor = HDFSPath :: descriptor + descriptor +} + + override def getIcon(): Array[Byte] = { + ImageUtil.getImage("ftp.png") + } + + override def getGroup(): List[String] = { + List(StopGroupEnum.FtpGroup.toString) + } + + override def initialize(ctx: ProcessContext): Unit = { + + } + + +} diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ftp/SelectFilesByName.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ftp/SelectFilesByName.scala new file mode 100644 index 0000000..d85a045 --- /dev/null +++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ftp/SelectFilesByName.scala @@ -0,0 +1,109 @@ +package cn.piflow.bundle.ftp + +import java.util.regex.Pattern + +import cn.piflow.conf.bean.PropertyDescriptor +import cn.piflow.conf.util.{ImageUtil, MapUtil} +import cn.piflow.conf.{ConfigurableStop, PortEnum} +import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.types.{StringType, StructField, StructType} +import org.apache.spark.sql.{DataFrame, Row, SparkSession} + +import scala.collection.mutable.ArrayBuffer + +class SelectFilesByName extends ConfigurableStop{ + override val authorEmail: String = "yangqidong@cnic.cn" + override val description: String = "Selecting files by file name" + override val inportList: List[String] = List(PortEnum.NonePort.toString) + override val outportList: List[String] = List(PortEnum.DefaultPort.toString) + + var HDFSUrl:String=_ + var HDFSPath:String=_ + var selectionConditions:String =_ + 
+ var fs: FileSystem=null + var pathARR:ArrayBuffer[String]=ArrayBuffer() + var selectArr:Array[String]=null + + def selectFile(path: String): Unit = { + val statusesARR: Array[FileStatus] = fs.listStatus(new Path(path)) + for(each <- statusesARR){ + val pathStr = each.getPath.toString + if(each.isFile){ + val fileName: String = pathStr.split("/").last + selectArr = selectionConditions.split(",") + var b: Boolean =false + for(x <- selectArr){ + b = Pattern.matches(x,fileName) + if(b){ + pathARR += pathStr + } + } + }else{ + selectFile(pathStr) + } + } + } + + override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { + + val session: SparkSession = pec.get[SparkSession]() + + val configuration: Configuration = new Configuration() + configuration.set("fs.defaultFS", HDFSUrl) + fs = FileSystem.get(configuration) + + selectFile(HDFSPath) + + val rows: List[Row] = pathARR.map(each => { + var arr:Array[String]=Array(each) + val row: Row = Row.fromSeq(arr) + row + }).toList + val rowRDD: RDD[Row] = session.sparkContext.makeRDD(rows) + val fields: Array[StructField] = "path".split("/").map(d=>StructField(d,StringType,nullable = true)) + val schema: StructType = StructType(fields) + val df: DataFrame = session.createDataFrame(rowRDD,schema) + + + println("#################################################") + df.show(20) + println("#################################################") + + out.write(df) + } + + override def setProperties(map: Map[String, Any]): Unit = { + HDFSUrl=MapUtil.get(map,key="HDFSUrl").asInstanceOf[String] + HDFSPath=MapUtil.get(map,key="HDFSPath").asInstanceOf[String] + selectionConditions=MapUtil.get(map,key="selectionConditions").asInstanceOf[String] + } + + + override def getPropertyDescriptor(): List[PropertyDescriptor] = { + var descriptor : List[PropertyDescriptor] = List() + val HDFSUrl = new PropertyDescriptor().name("HDFSUrl").displayName("HDFSUrl").defaultValue("The URL of the HDFS file system, such as 
hdfs://10.0.88.70:9000").required(true) + val HDFSPath = new PropertyDescriptor().name("HDFSPath").displayName("HDFSPath").defaultValue("The save path of the HDFS file system, such as /test/Ab").required(true) + val selectionConditions = new PropertyDescriptor().name("selectionConditions").displayName("selectionConditions").defaultValue("To select conditions, you need to fill in regular expressions in java, such as. * abc. *").required(true) + descriptor = HDFSUrl :: descriptor + descriptor = HDFSPath :: descriptor + descriptor = selectionConditions :: descriptor + descriptor + } + + override def getIcon(): Array[Byte] = { + ImageUtil.getImage("ftp.png") + } + + override def getGroup(): List[String] = { + List(StopGroupEnum.FtpGroup.toString) + } + + override def initialize(ctx: ProcessContext): Unit = { + + } + +} diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/http/UnzipFilesOnHDFS.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/http/UnzipFilesOnHDFS.scala index 85d0eac..e3f7719 100644 --- a/piflow-bundle/src/main/scala/cn/piflow/bundle/http/UnzipFilesOnHDFS.scala +++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/http/UnzipFilesOnHDFS.scala @@ -12,178 +12,122 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import scala.collection.mutable.ArrayBuffer + class UnzipFilesOnHDFS extends ConfigurableStop { val authorEmail: String = "yangqidong@cnic.cn" val description: String = "Unzip files on HDFS" - val inportList: List[String] = List(PortEnum.NonePort.toString) + val inportList: List[String] = List(PortEnum.DefaultPort.toString) val outportList: List[String] = List(PortEnum.DefaultPort.toString) + var isCustomize:String=_ var filePath:String=_ var fileType:String=_ var unzipPath:String=_ - def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { - val session: SparkSession = pec.get[SparkSession]() + var 
session: SparkSession = null + + def unzipFile(hdfsFilePath: String, zipFileType: String, unzipHdfsPath: String):String = { + var zft: String = "" + if(zipFileType.length < 1){ + zft = hdfsFilePath.split("\\.").last + }else{ + zft = zipFileType + } val configuration: Configuration = new Configuration() - val pathARR: Array[String] = filePath.split("\\/") + val pathARR: Array[String] = hdfsFilePath.split("\\/") var hdfsUrl:String="" for (x <- (0 until 3)){ hdfsUrl+=(pathARR(x) +"/") } configuration.set("fs.defaultFS",hdfsUrl) - // configuration.set("dfs.nameservices", "nameservice1") - // configuration.set("dfs.ha.namenodes.nameservice1", "nn1,nn2"); - // configuration.set("dfs.namenode.rpc-address.nameservice1.nn1", "xxx:8020"); - // configuration.set("dfs.namenode.rpc-address.nameservice1.nn2", "xxx:8020"); - // configuration.set("dfs.client.failover.proxy.provider.nameservice1" - // ,"org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"); - // configuration.addResource("classpath:/hadoop/core-site.xml"); - // configuration.addResource("classpath:/hadoop/hdfs-site.xml"); - // configuration.addResource("classpath:/hadoop/mapred-site.xml"); + var uhp : String="" + if(unzipHdfsPath.length < 1){ + for (x <- (0 until pathARR.length-1)){ + uhp+=(pathARR(x) +"/") + } + }else{ + uhp=unzipHdfsPath + } val fs = FileSystem.get(configuration) - val fdis: FSDataInputStream = fs.open(new Path(filePath)) - - - val filePathArr: Array[String] = filePath.split("/") + val fdis: FSDataInputStream = fs.open(new Path(hdfsFilePath)) + val filePathArr: Array[String] = hdfsFilePath.split("/") var fileName: String = filePathArr.last if(fileName.length == 0){ fileName = filePathArr(filePathArr.size-2) } - if(fileType.equals("gz")){ + var savePath:String="" + if(zft.equals("gz")){ val gzip: GZIPInputStream = new GZIPInputStream(fdis) var n = -1 val buf=new Array[Byte](10*1024*1024) - val savePath = new Path(unzipPath +fileName.replace(".gz","")) - val fdos = 
fs.create(savePath) + savePath = uhp +fileName.replace(".gz","") + val path = new Path(savePath) + val fdos = fs.create(path) while((n=gzip.read(buf)) != -1 && n != -1){ fdos.write(buf,0,n) fdos.flush() } - - - }/*else if(fileType.equals("tar")){ - - var entryNum:Int=0 - var entryFileName:String=null - var entryFile:File=null - var subEntryFile:File=null - var subEntryFileName:String=null - var tarArchiveEntries:Array[TarArchiveEntry]=null - var fileList:List[String]=List() - var fos:FileOutputStream=null - - var entry: TarArchiveEntry = null - val tarIs: TarArchiveInputStream = new TarArchiveInputStream(fdis) - while ((entry = tarIs.getNextTarEntry) != null && entry != null) { - entryFileName= localPath +File.separator+entry.getName() - entryFile=new File(entryFileName) - entryNum += 1 - if(entry.isDirectory()){ - if(!entryFile.exists()){ - entryFile.mkdirs() - } - tarArchiveEntries=entry.getDirectoryEntries() - for(i<-0 until tarArchiveEntries.length){ - subEntryFileName=entryFileName+File.separator+tarArchiveEntries(i).getName() - subEntryFile=new File(subEntryFileName) - fileList=subEntryFileName::fileList - fos=new FileOutputStream(subEntryFile) - var mark = -1 - val buf=new Array[Byte](4*1024) - while((mark=tarIs.read(buf)) != -1 && mark != -1){ - fos.write(buf,0,mark) - } - fos.close() - fos=null - } - }else{ - fileList = entryFileName :: fileList - fos=new FileOutputStream(entryFile) - var mark = -1 - val buf=new Array[Byte](4*1024) - while((mark=tarIs.read(buf)) != -1 && mark != -1){ - fos.write(buf,0,mark) - } - fos.close() - fos=null - } - - } - if(entryNum==0){ - println("there is no file!") - } - - }else if(fileType.equals("tar.gz")){ - - var entryNum:Int=0 - var entryFileName:String=null - var entryFile:File=null - var subEntryFile:File=null - var subEntryFileName:String=null - var tarArchiveEntries:Array[TarArchiveEntry]=null - var fileList:List[String]=List() - var fos:FileOutputStream=null - - var entry: TarArchiveEntry = null - val 
gzip:GZIPInputStream=new GZIPInputStream(fdis) - val tarIs: TarArchiveInputStream = new TarArchiveInputStream(gzip) - while ((entry = tarIs.getNextTarEntry) != null && entry != null) { - entryFileName=localPath +File.separator+entry.getName() - entryFile=new File(entryFileName) - entryNum += 1 - if(entry.isDirectory()){ - if(!entryFile.exists()){ - entryFile.mkdirs() - } - tarArchiveEntries=entry.getDirectoryEntries() - for(i<-0 until tarArchiveEntries.length){ - subEntryFileName=entryFileName+File.separator+tarArchiveEntries(i).getName() - subEntryFile=new File(subEntryFileName) - fileList=subEntryFileName::fileList - fos=new FileOutputStream(subEntryFile) - var mark = -1 - val buf=new Array[Byte](4*1024) - while((mark=tarIs.read(buf)) != -1 && mark != -1){ - fos.write(buf,0,mark) - } - fos.close() - fos=null - } - }else{ - fileList = entryFileName :: fileList - fos=new FileOutputStream(entryFile) - var mark = -1 - val buf=new Array[Byte](4*1024) - while((mark=tarIs.read(buf)) != -1 && mark != -1){ - fos.write(buf,0,mark) - } - fos.close() - fos=null - } - - } - if(entryNum==0){ - println("there is no file!") - } - }*/else{ + fdos.close() + gzip.close() + fdis.close() + }else{ throw new RuntimeException("File type fill in error, or do not support this type.") } - var seq:Seq[String]=Seq(unzipPath) - val row: Row = Row.fromSeq(seq) - val list:List[Row]=List(row) - val rdd: RDD[Row] = session.sparkContext.makeRDD(list) + savePath + + } + + def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { + + session = pec.get[SparkSession]() + + var savePath: String = "" + var arr:ArrayBuffer[Row]=ArrayBuffer() + + + if(isCustomize.equals("true")){ + println("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + + savePath = unzipFile(filePath,fileType,unzipPath) + + + println("savepath : "+savePath) + + arr += Row.fromSeq(Array(savePath)) + + }else if (isCustomize.equals("false")){ + + val inDf: DataFrame = in.read() + inDf.collect().foreach(row => 
{ + + filePath = row.get(0).asInstanceOf[String] + savePath = unzipFile(filePath,"","") + arr += Row.fromSeq(Array(savePath)) + savePath = "" + + }) + + } + + val rdd: RDD[Row] = session.sparkContext.makeRDD(arr.toList) val fields: Array[StructField] =Array(StructField("unzipPath",StringType,nullable = true)) val schema: StructType = StructType(fields) val df: DataFrame = session.createDataFrame(rdd,schema) + println("##################################################################################################") +// println(df.count()) + df.show(20) + println("##################################################################################################") + out.write(df) } @@ -193,6 +137,7 @@ class UnzipFilesOnHDFS extends ConfigurableStop { } def setProperties(map : Map[String, Any]) = { + isCustomize=MapUtil.get(map,key="isCustomize").asInstanceOf[String] filePath=MapUtil.get(map,key="filePath").asInstanceOf[String] fileType=MapUtil.get(map,key="fileType").asInstanceOf[String] unzipPath=MapUtil.get(map,key="unzipPath").asInstanceOf[String] @@ -201,9 +146,15 @@ class UnzipFilesOnHDFS extends ConfigurableStop { override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() - val filePath = new PropertyDescriptor().name("filePath").displayName("filePath").description("file path,such as hdfs://10.0.86.89:9000/a/a.gz").defaultValue("").required(true) - val fileType = new PropertyDescriptor().name("fileType").displayName("fileType").description("file type,such as gz").defaultValue("").required(true) + val filePath = new PropertyDescriptor().name("filePath").displayName("filePath").description("file path,such as hdfs://10.0.86.89:9000/a/a.gz").defaultValue("").required(false) + val fileType = new PropertyDescriptor().name("fileType").displayName("fileType").description("file type,such as gz").defaultValue("").required(false) val unzipPath = new 
class EmblParser extends ConfigurableStop {

  override val authorEmail: String = "yangqidong@cnic.cn"
  override val description: String = "Parsing EMBL type data"
  override val inportList: List[String] = List(PortEnum.DefaultPort.toString)
  override val outportList: List[String] = List(PortEnum.DefaultPort.toString)

  /**
    * Reads a DataFrame of HDFS file paths from the upstream port, parses each
    * EMBL flat file with BioJava, streams all records into one temporary JSON
    * array file on HDFS, then loads that file as a DataFrame and writes it to
    * the output port.
    */
  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val session = pec.get[SparkSession]()
    val inDf: DataFrame = in.read()

    // Derive the HDFS base URL (e.g. "hdfs://host:9000/") from the first path:
    // splitting "hdfs://host:9000/a/b" on '/' yields ("hdfs:", "", "host:9000").
    val firstPath: String = inDf.take(1)(0).get(0).asInstanceOf[String]
    val hdfsUrl: String = firstPath.split("\\/").take(3).mkString("", "/", "/")

    val configuration: Configuration = new Configuration()
    configuration.set("fs.defaultFS", hdfsUrl)
    val fs: FileSystem = FileSystem.get(configuration)

    // FIX: the temp file was named after the Refseq parser (copy-paste bug),
    // so running both stops concurrently would clobber each other's output.
    val hdfsPathTemporary: String = hdfsUrl + "/EMBL_Parser_temporary.json"
    val path: Path = new Path(hdfsPathTemporary)
    if (fs.exists(path)) {
      fs.delete(path, true) // FIX: non-deprecated overload of delete()
    }

    // FIX: write through a single create() stream instead of the original
    // create-close-append dance — append() is not enabled on every HDFS.
    val fdos: FSDataOutputStream = fs.create(path)
    var recordCount: Int = 0

    inDf.collect().foreach(row => {
      val pathStr: String = row.get(0).asInstanceOf[String]
      println("start parser ^^^ " + pathStr)

      val fdis: FSDataInputStream = fs.open(new Path(pathStr))
      val br: BufferedReader = new BufferedReader(new InputStreamReader(fdis))
      try {
        val sequences: RichSequenceIterator = CustomIOTools.IOTools.readEMBLDNA(br, null)
        while (sequences.hasNext) {
          val seq: RichSequence = sequences.nextRichSequence()
          val doc: JSONObject = new JSONObject
          Process.processEMBL_EnsemblSeq(seq, doc)
          // Emit a JSON array: '[' before the first record, ',' separators after.
          fdos.write((if (recordCount == 0) "[" else ",").getBytes("utf-8"))
          fdos.write(doc.toString.getBytes("utf-8"))
          recordCount += 1
        }
      } finally {
        br.close() // FIX: reader/stream were never closed in the original
      }
    })

    // FIX: with zero parsed records the original wrote "]" without "[",
    // producing invalid JSON; emit "[]" instead.
    if (recordCount == 0) fdos.write("[".getBytes("utf-8"))
    fdos.write("]".getBytes("utf-8"))
    fdos.close()

    println("start parser HDFSjsonFile")
    val df: DataFrame = session.read.json(hdfsPathTemporary)
    df.show(20)
    out.write(df)
  }

  override def setProperties(map: Map[String, Any]): Unit = {}

  // This stop has no configurable properties.
  override def getPropertyDescriptor(): List[PropertyDescriptor] = List()

  override def getIcon(): Array[Byte] = ImageUtil.getImage("/microorganism/EMBL_Logo.svg")

  override def getGroup(): List[String] = List(StopGroupEnum.MicroorganismGroup.toString)

  override def initialize(ctx: ProcessContext): Unit = {}

}
class RefseqParser extends ConfigurableStop {

  override val authorEmail: String = "yangqidong@cnic.cn"
  override val description: String = "Parsing Refseq_genome type data"
  override val inportList: List[String] = List(PortEnum.DefaultPort.toString)
  override val outportList: List[String] = List(PortEnum.DefaultPort.toString)

  /**
    * Reads a DataFrame of HDFS file paths from the upstream port, parses each
    * Refseq/GenBank flat file with BioJava, streams all records into one
    * temporary JSON array file on HDFS, then loads that file as a DataFrame
    * and writes it to the output port.
    */
  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val session = pec.get[SparkSession]()
    val inDf: DataFrame = in.read()

    // Derive the HDFS base URL (e.g. "hdfs://host:9000/") from the first path:
    // splitting "hdfs://host:9000/a/b" on '/' yields ("hdfs:", "", "host:9000").
    val firstPath: String = inDf.take(1)(0).get(0).asInstanceOf[String]
    val hdfsUrl: String = firstPath.split("\\/").take(3).mkString("", "/", "/")

    val configuration: Configuration = new Configuration()
    configuration.set("fs.defaultFS", hdfsUrl)
    val fs: FileSystem = FileSystem.get(configuration)

    val hdfsPathTemporary: String = hdfsUrl + "/Refseq_genomeParser_temporary.json"
    val path: Path = new Path(hdfsPathTemporary)
    if (fs.exists(path)) {
      fs.delete(path, true) // FIX: non-deprecated overload of delete()
    }

    // FIX: write through a single create() stream instead of the original
    // create-close-append dance — append() is not enabled on every HDFS.
    val fdos: FSDataOutputStream = fs.create(path)
    var recordCount: Int = 0

    inDf.collect().foreach(row => {
      val pathStr: String = row.get(0).asInstanceOf[String]
      println("start parser ^^^ " + pathStr)

      val fdis: FSDataInputStream = fs.open(new Path(pathStr))
      val br: BufferedReader = new BufferedReader(new InputStreamReader(fdis))
      try {
        val sequences: RichSequenceIterator = CustomIOTools.IOTools.readGenbankProtein(br, null)
        while (sequences.hasNext) {
          val seq: RichSequence = sequences.nextRichSequence()
          val doc: JSONObject = new JSONObject
          Process.processSingleSequence(seq, doc)
          // Emit a JSON array: '[' before the first record, ',' separators after.
          fdos.write((if (recordCount == 0) "[" else ",").getBytes("utf-8"))
          fdos.write(doc.toString.getBytes("utf-8"))
          recordCount += 1
        }
      } finally {
        br.close() // FIX: reader/stream were never closed in the original
      }
    })

    // FIX: with zero parsed records the original wrote "]" without "[",
    // producing invalid JSON; emit "[]" instead.
    if (recordCount == 0) fdos.write("[".getBytes("utf-8"))
    fdos.write("]".getBytes("utf-8"))
    fdos.close()

    println("start parser HDFSjsonFile")
    val df: DataFrame = session.read.json(hdfsPathTemporary)
    df.show(20)
    out.write(df)
  }

  override def setProperties(map: Map[String, Any]): Unit = {}

  // This stop has no configurable properties.
  override def getPropertyDescriptor(): List[PropertyDescriptor] = List()

  override def getIcon(): Array[Byte] = ImageUtil.getImage("/microorganism/refseq.png")

  override def getGroup(): List[String] = List(StopGroupEnum.MicroorganismGroup.toString)

  override def initialize(ctx: ProcessContext): Unit = {}

}
org.biojava.bio.symbol.SimpleSymbolList; +import org.biojava.bio.symbol.Symbol; +import org.biojava.bio.symbol.SymbolList; +import org.biojava.utils.ChangeVetoException; +import org.biojavax.*; +import org.biojavax.bio.seq.MultiSourceCompoundRichLocation; +import org.biojavax.bio.seq.RichFeature; +import org.biojavax.bio.seq.RichLocation; +import org.biojavax.bio.seq.RichSequence; +import org.biojavax.bio.seq.io.GenbankLocationParser; +import org.biojavax.bio.seq.io.RichSeqIOListener; +import org.biojavax.bio.seq.io.RichSequenceFormat; +import org.biojavax.bio.taxa.NCBITaxon; +import org.biojavax.bio.taxa.SimpleNCBITaxon; +import org.biojavax.ontology.ComparableTerm; +import org.biojavax.utils.StringTools; + +import java.io.*; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Created by xiujuan on 2016/1/27. + */ +public class CustomEMBLFormat extends RichSequenceFormat.HeaderlessFormat { + // Register this format with the format auto-guesser. 
+ static { + RichSequence.IOTools.registerFormat(CustomEMBLFormat.class); + } + + /** + * The name of the Pre-87 format + */ + public static final String EMBL_PRE87_FORMAT = "EMBL_PRE87"; + + /** + * The name of the current format + */ + public static final String EMBL_FORMAT = "EMBL"; + + protected static final String LOCUS_TAG = "ID"; + protected static final String ACCESSION_TAG = "AC"; + protected static final String VERSION_TAG = "SV"; + protected static final String DEFINITION_TAG = "DE"; + protected static final String DATE_TAG = "DT"; + protected static final String DATABASE_XREF_TAG = "DR"; + protected static final String SOURCE_TAG = "OS"; + protected static final String ORGANISM_TAG = "OC"; + protected static final String ORGANELLE_TAG = "OG"; + protected static final String REFERENCE_TAG = "RN"; + protected static final String REFERENCE_POSITION_TAG = "RP"; + protected static final String REFERENCE_XREF_TAG = "RX"; + protected static final String AUTHORS_TAG = "RA"; + protected static final String CONSORTIUM_TAG = "RG"; + protected static final String TITLE_TAG = "RT"; + protected static final String LOCATOR_TAG = "RL"; + protected static final String REMARK_TAG = "RC"; + protected static final String KEYWORDS_TAG = "KW"; + protected static final String COMMENT_TAG = "CC"; + protected static final String FEATURE_HEADER_TAG = "FH"; + protected static final String FEATURE_TAG = "FT"; + protected static final String CONTIG_TAG = "CO"; + protected static final String TPA_TAG = "AH"; + protected static final String START_SEQUENCE_TAG = "SQ"; + protected static final String DELIMITER_TAG = "XX"; + protected static final String END_SEQUENCE_TAG = "//"; + + // the date pattern + // date (Rel. N, Created) + // date (Rel. 
N, Last updated, Version M) + protected static final Pattern dp = Pattern.compile("([^\\s]+)\\s*(\\(Rel\\.\\s+(\\d+), ([^\\)\\d]+)(\\d*)\\))?$"); + // locus line + protected static final Pattern lp = Pattern.compile("^(\\S+);\\s+SV\\s+(\\d+);\\s+(linear|circular);\\s+(\\S+\\s?\\S+?);\\s+(\\S+);\\s+(\\S+);\\s+(\\d+)\\s+(BP|AA)\\.$"); + protected static final Pattern lpPre87 = Pattern.compile("^(\\S+)\\s+standard;\\s+(circular)?\\s*(genomic)?\\s*(\\S+);\\s+(\\S+);\\s+\\d+\\s+BP\\.$"); + // version line + protected static final Pattern vp = Pattern.compile("^(\\S+?)\\.(\\d+)$"); + // reference position line + protected static final Pattern rpp = Pattern.compile("^(\\d+)(-(\\d+))?,?(\\s?\\d+-\\d+,?)*$"); + // dbxref line + protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$"); + + protected static final Pattern readableFileNames = Pattern.compile(".*\\u002e(em|dat).*"); + protected static final Pattern headerLine = Pattern.compile("^ID.*"); + + private NCBITaxon tax = null; + private String organism = null; + private String accession = null; + + /** + * Implements some EMBL-specific terms. 
    /**
     * EMBL-specific ontology terms, each looked up (or lazily created) in the
     * default ontology via {@link RichObjectFactory}.
     */
    public static class Terms extends RichSequence.Terms {

        /**
         * Getter for the RelUpdatedRecordVersion term (record version from the
         * "Last updated, Version M" DT line).
         * @return The RelUpdatedRecordVersion Term
         */
        public static ComparableTerm getRelUpdatedRecordVersionTerm() {
            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("RelUpdatedRecordVersion");
        }

        /**
         * Getter for the EMBL term (used as the source term for features).
         * @return The EMBL Term
         */
        public static ComparableTerm getEMBLTerm() {
            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("EMBL");
        }

        /**
         * Getter for the Ensembl-specific 'genomic' term.
         * @return The genomic Term
         */
        public static ComparableTerm getGenomicTerm() {
            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("genomic");
        }

        /**
         * Getter for the Ensembl-specific 'versionLine' term (stores an SV line
         * that does not match the accession.version pattern).
         * @return The version line Term
         */
        public static ComparableTerm getVersionLineTerm() {
            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("versionLine");
        }

        /**
         * Getter for the Ensembl-specific 'dataClass' term (fifth token of the
         * ID line).
         * @return The data class Term
         */
        public static ComparableTerm getDataClassTerm() {
            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("dataClass");
        }

        /**
         * Getter for the Ensembl-specific 'organism' term - "ORGANISM_TAG".
         * added by xiujuan 2016-1-28
         * @return The organism Term
         */
        public static ComparableTerm getOrganismTerm(){
            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("organism");
        }

        /**
         * Getter for the 'length' term.
         * added by xiujuan 2016-1-28
         * @return The length
         */
        public static ComparableTerm getLengthTerm(){
            return RichObjectFactory.getDefaultOntology().getOrCreateTerm("length");
        }
    }
+ */ + public boolean canRead(File file) throws IOException { + if (readableFileNames.matcher(file.getName()).matches()) return true; + BufferedReader br = new BufferedReader(new FileReader(file)); + String firstLine = br.readLine(); + boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() && + (lp.matcher(firstLine.substring(3).trim()).matches() || + lpPre87.matcher(firstLine.substring(3).trim()).matches() + ); + br.close(); + return readable; + } + + /** + * {@inheritDoc} + * Always returns a DNA tokenizer. + */ + public SymbolTokenization guessSymbolTokenization(File file) throws IOException { + return RichSequence.IOTools.getDNAParser(); + } + + /** + * {@inheritDoc} + * A stream is in EMBL format if its first line matches the EMBL format for the ID line. + */ + public boolean canRead(BufferedInputStream stream) throws IOException { + stream.mark(2000); // some streams may not support this + BufferedReader br = new BufferedReader(new InputStreamReader(stream)); + String firstLine = br.readLine(); + boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() && + (lp.matcher(firstLine.substring(3).trim()).matches() || + lpPre87.matcher(firstLine.substring(3).trim()).matches() + ); + // don't close the reader as it'll close the stream too. + // br.close(); + stream.reset(); + return readable; + } + + /** + * {@inheritDoc} + * Always returns a DNA tokenizer. 
+ */ + public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException { + return RichSequence.IOTools.getDNAParser(); + } + + /** + * {@inheritDoc} + */ + public boolean readSequence(BufferedReader reader, + SymbolTokenization symParser, + SeqIOListener listener) + throws IllegalSymbolException, IOException, ParseException { + if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today"); + return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null); + } + + /** + * {@inheritDoc} + */ + public boolean readRichSequence(BufferedReader reader, + SymbolTokenization symParser, + RichSeqIOListener rlistener, + Namespace ns) + throws IllegalSymbolException, IOException, ParseException { + tax = null; + organism = null; + accession = null; + boolean hasAnotherSequence = true; + //boolean hasInternalWhitespace = false; + + rlistener.startSequence(); + + if (ns==null) ns=RichObjectFactory.getDefaultNamespace(); + rlistener.setNamespace(ns); + + // Get an ordered list of key->value pairs in array-tuples + String sectionKey = null; + do { + List section = this.readSection(reader); + sectionKey = ((String[])section.get(0))[0]; + if(sectionKey == null){ + + String message = ParseException.newMessage(this.getClass(), accession, "No section key", "Not set", sectionToString(section)); + System.err.println("error happens: " + message); + throw new ParseException(message); + } + // process section-by-section + if (sectionKey.equals(LOCUS_TAG)) { + // entryname dataclass; [circular] molecule; division; sequencelength BP. 
+ String loc = ((String[])section.get(0))[1]; + Matcher m = lp.matcher(loc); + Matcher mPre87 = lpPre87.matcher(loc); + if (m.matches()) { + // first token is both name and primary accession + rlistener.setName(m.group(1)); + rlistener.setAccession(m.group(1)); + // second token is version + rlistener.setVersion(Integer.parseInt(m.group(2))); + // third token is circular/linear + rlistener.setCircular(m.group(3).equals("circular")); + // fourth token is moltype + rlistener.addSequenceProperty(Terms.getMolTypeTerm(),m.group(4)); + // fifth token is data class + rlistener.addSequenceProperty(Terms.getDataClassTerm(),m.group(5)); + // sixth token is taxonomic division + rlistener.setDivision(m.group(6)); + // seventh token is sequence length, which is ignored + // as it is calculated from the sequence data later. + } else if (mPre87.matches()) { + rlistener.setName(mPre87.group(1)); + if (mPre87.group(3)!=null) { + // add annotation for 'genomic' (Ensembl-specific term) + rlistener.addSequenceProperty(Terms.getGenomicTerm(),null); + } + rlistener.addSequenceProperty(Terms.getMolTypeTerm(),mPre87.group(4)); + rlistener.setDivision(mPre87.group(5)); + // Optional extras + String circular = mPre87.group(2); + if (circular!=null) rlistener.setCircular(true); + } else { + String message = ParseException.newMessage(this.getClass(),accession,"Not Set","Bad ID line found", sectionToString(section)); + System.err.println("error happens: " + message); + throw new ParseException(message); + } + } else if (sectionKey.equals(DEFINITION_TAG)) { + rlistener.setDescription(((String[])section.get(0))[1]); + } else if (sectionKey.equals(SOURCE_TAG)) { + // only interested in organelle sub-tag + for (int i = 1; i < section.size(); i++) { + sectionKey = ((String[])section.get(i))[0]; + if (sectionKey.equals(ORGANELLE_TAG)) { + rlistener.addSequenceProperty(Terms.getOrganelleTerm(), ((String[])section.get(i))[1].trim()); + break; // skip out of for loop once found + } + 
if(sectionKey.equals(ORGANISM_TAG)){ + rlistener.addSequenceProperty(Terms.getOrganismTerm(), ((String[])section.get(i))[1].trim()); + break; + } + } + } else if (sectionKey.equals(DATE_TAG)) { + String chunk = ((String[])section.get(0))[1].trim(); + Matcher dm = dp.matcher(chunk); + if (dm.matches()) { + String date = dm.group(1); + String rel = dm.group(3); + String type = dm.group(4); + if (type.equals("Created")) { + rlistener.addSequenceProperty(Terms.getDateCreatedTerm(), date); + rlistener.addSequenceProperty(Terms.getRelCreatedTerm(), rel); + } else if (type.equals("Last updated, Version ")) { + rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(), date); + rlistener.addSequenceProperty(Terms.getRelUpdatedTerm(), rel); + rlistener.addSequenceProperty(Terms.getRelUpdatedRecordVersionTerm(), dm.group(5)); + } else { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad date type found",sectionToString(section)); + System.err.println("error happens: " + message); + throw new ParseException(message); + } + } else { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad date line found",sectionToString(section)); + System.err.println("error happens: " + message); + throw new ParseException(message); + + } + } else if (sectionKey.equals(ACCESSION_TAG)) { + // if multiple accessions, store only first as accession, + // and store rest in annotation + String[] accs = ((String[])section.get(0))[1].split(";"); + accession = accs[0].trim(); + rlistener.setAccession(accession); + for (int i = 1; i < accs.length; i++) { + rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accs[i].trim()); + } + } else if (sectionKey.equals(VERSION_TAG)) { + String ver = ((String[])section.get(0))[1]; + Matcher m = vp.matcher(ver); + if (m.matches()) { + String verAcc = m.group(1); + if (!accession.equals(verAcc)) { + // the version refers to a different accession! 
+ // believe the version line, and store the original + // accession away in the additional accession set + rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accession); + accession = verAcc; + rlistener.setAccession(accession); + } + rlistener.setVersion(Integer.parseInt(m.group(2))); + } else { + rlistener.addSequenceProperty(Terms.getVersionLineTerm(),ver); + } + } else if (sectionKey.equals(KEYWORDS_TAG)) { + String val = ((String[])section.get(0))[1]; + val = val.substring(0,val.length()-1); // chomp dot + val = val.replace('\n',' '); //remove newline + String[] kws = val.split(";"); + for (int i = 0; i < kws.length; i++) { + String kw = kws[i].trim(); + if (kw.length()==0) continue; + rlistener.addSequenceProperty(Terms.getKeywordTerm(), kw); + } + } else if (sectionKey.equals(DATABASE_XREF_TAG)) { + String val = ((String[])section.get(0))[1]; + val = val.substring(0,val.length()-1); // chomp dot + // database_identifier; primary_identifier; secondary_identifier.... 
+ String[] parts = val.split(";"); + // construct a DBXREF out of the dbname part[0] and accession part[1] + CrossRef crossRef = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{parts[0].trim(),parts[1].trim(), new Integer(0)}); + // assign remaining bits of info as annotations + for (int j = 2; j < parts.length; j++) { + Note note = new SimpleNote(Terms.getAdditionalAccessionTerm(),parts[j].trim(),j-1); + try { + crossRef.getRichAnnotation().addNote(note); + } catch (ChangeVetoException ce) { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "Could not annotate identifier terms",sectionToString(section)); + ParseException pe = new ParseException(message); + System.err.println("error happens: " + message); + pe.initCause(ce); + throw pe; + } + } + RankedCrossRef rcrossRef = new SimpleRankedCrossRef(crossRef, 0); + rlistener.setRankedCrossRef(rcrossRef); + } else if (sectionKey.equals(REFERENCE_TAG) && !this.getElideReferences()) { + // first line of section has rank and location + String refrank = ((String[])section.get(0))[1]; + int ref_rank = Integer.parseInt(refrank.substring(1,refrank.length()-1)); + int ref_start = -999; + int ref_end = -999; + // rest can be in any order + String consortium = null; + String authors = ""; + String title = null; + String locator = null; + String pubmed = null; + String medline = null; + String doi = null; + String remark = null; + for (int i = 1; i < section.size(); i++) { + String key = ((String[])section.get(i))[0]; + String val = ((String[])section.get(i))[1]; + if (key.equals(AUTHORS_TAG)) { + if (val.endsWith(";")) val = val.substring(0,val.length()-1); // chomp semicolon + authors = val.replace('\n',' '); //see #2276 + }else if (key.equals(CONSORTIUM_TAG)) { + if (val.endsWith(";")) val = val.substring(0,val.length()-1); // chomp semicolon + consortium = val.replace('\n',' '); //see #2276 + }else if (key.equals(TITLE_TAG)) { + if (val.length()>1) { + if 
(val.endsWith(";")) val = val.substring(0,val.length()-1); // chomp semicolon + if (val.endsWith("\"")) val = val.substring(1,val.length()-1); // chomp quotes + title = val.replace('\n',' '); //see #2276 + } else title=null; // single semi-colon indicates no title + }else if (key.equals(LOCATOR_TAG)) { + if (val.endsWith(".")) val = val.substring(0,val.length()-1); // chomp dot + locator = val.replace('\n',' '); //see #2276 + }else if (key.equals(REFERENCE_XREF_TAG)) { + // database_identifier; primary_identifier. + String[] refs = val.split("\\.(\\s+|$)"); + for (int j = 0 ; j < refs.length; j++) { + if (refs[j].trim().length()==0) continue; + String[] parts = refs[j].split(";"); + if(parts.length == 2){ + String db = parts[0]; + String ref = parts[1].trim(); + if (db.equalsIgnoreCase(Terms.PUBMED_KEY)) pubmed = ref; + else if (db.equalsIgnoreCase(Terms.MEDLINE_KEY)) medline = ref; + else if (db.equalsIgnoreCase(Terms.DOI_KEY)) doi = ref; + } + } + }else if (key.equals(REMARK_TAG)) remark = val.replace('\n',' '); //see #2276 + else if (key.equals(REFERENCE_POSITION_TAG)) { + // only the first group is taken + // if we have multiple lines, only the last line is taken + Matcher m = rpp.matcher(val); + if (m.matches()) { + ref_start = Integer.parseInt(m.group(1)); + if(m.group(2) != null) + ref_end = Integer.parseInt(m.group(3)); + } else { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad reference line found",sectionToString(section)); + System.err.println("error happens: " + message); + throw new ParseException(message); + } + } + } + // create the docref object + try { + List authSet = DocRefAuthor.Tools.parseAuthorString(authors); + if (consortium!=null) authSet.add(new SimpleDocRefAuthor(consortium, true, false)); + DocRef dr = (DocRef)RichObjectFactory.getObject(SimpleDocRef.class,new Object[]{authSet,locator,title}); + // assign either the pubmed or medline to the docref - medline gets priority, then pubmed, then doi + if 
(medline!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.MEDLINE_KEY, medline, new Integer(0)})); + else if (pubmed!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.PUBMED_KEY, pubmed, new Integer(0)})); + else if (doi!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.DOI_KEY, doi, new Integer(0)})); + // assign the remarks + if (!this.getElideComments()) dr.setRemark(remark); + // assign the docref to the bioentry + RankedDocRef rdr = new SimpleRankedDocRef(dr, + (ref_start != -999 ? new Integer(ref_start) : null), + (ref_end != -999 ? new Integer(ref_end) : null), + ref_rank); + rlistener.setRankedDocRef(rdr); + rlistener.setRankedDocRef(rdr); + } catch (ChangeVetoException e) { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section)); + System.err.println("error happens: " + message); + throw new ParseException(e, message); + } + } else if (sectionKey.equals(COMMENT_TAG) && !this.getElideComments()) { + // Set up some comments + rlistener.setComment(((String[])section.get(0))[1]); + } else if (sectionKey.equals(FEATURE_TAG) && !this.getElideFeatures()) { + // starting from second line of input, start a new feature whenever we come across + // a key that does not start with / + boolean seenAFeature = false; + int rcrossrefCount = 0; + boolean skippingBond = false; + for (int i = 1 ; i < section.size(); i++) { + String key = ((String[])section.get(i))[0]; + String val = ((String[])section.get(i))[1]; + if (key.startsWith("/")) { + if(!skippingBond){ + key = key.substring(1); // strip leading slash + val = val.replaceAll("\\s*[\\n\\r]+\\s*"," ").trim(); + if (val.startsWith("\"")) val = val.substring(1,val.length()-1); // strip quotes + // parameter on old feature + if (key.equalsIgnoreCase("db_xref")) { + Matcher m = dbxp.matcher(val); + if (m.matches()) { + 
String dbname = m.group(1); + String raccession = m.group(2); + if (dbname.equalsIgnoreCase("taxon")) { + // Set the Taxon instead of a dbxref + tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{Integer.valueOf(raccession)}); + rlistener.setTaxon(tax); + try { + if (organism!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism); + } catch (ChangeVetoException e) { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section)); + System.err.println("error happens: " + message); + throw new ParseException(e, message); + } + } else { + try { + CrossRef cr = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{dbname, raccession, new Integer(0)}); + RankedCrossRef rcr = new SimpleRankedCrossRef(cr, ++rcrossrefCount); + rlistener.getCurrentFeature().addRankedCrossRef(rcr); + } catch (ChangeVetoException e) { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section)); + System.err.println("error happens: " + message); + throw new ParseException(e, message); + } + } + } else { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad dbxref found",sectionToString(section)); + System.err.println("error happens: " + message); + throw new ParseException(message); + } + } else if (key.equalsIgnoreCase("organism")) { + try { + organism = val; + if (tax!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism); + } catch (ChangeVetoException e) { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section)); + System.err.println("error happens: " + message); + throw new ParseException(message); + } + } else { + if (key.equalsIgnoreCase("translation")) { + // strip spaces from sequence + val = val.replaceAll("\\s+",""); + } + rlistener.addFeatureProperty(RichObjectFactory.getDefaultOntology().getOrCreateTerm(key),val); + } + } + } else { + // new feature! 
+ // end previous feature + if(key.equalsIgnoreCase("bond")) + { + skippingBond = true; + }else{ + skippingBond = false; + if (seenAFeature) { + rlistener.endFeature(); + } + // start next one, with lots of lovely info in it + RichFeature.Template templ = new RichFeature.Template(); + templ.annotation = new SimpleRichAnnotation(); + templ.sourceTerm = Terms.getEMBLTerm(); + templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(key); + templ.featureRelationshipSet = new TreeSet(); + templ.rankedCrossRefs = new TreeSet(); + String tidyLocStr = val.replaceAll("\\s+",""); + templ.location = GenbankLocationParser.parseLocation(ns, accession, tidyLocStr); + if(!(templ.location instanceof MultiSourceCompoundRichLocation)){ + rlistener.startFeature(templ); + seenAFeature = true; + rcrossrefCount = 0; + }else{ + System.err.println("encounter a MultiSourceCompoundRichLocation instance"); + skippingBond = true; + seenAFeature = false; + } + } + } + } + if (seenAFeature) rlistener.endFeature(); + } else if (sectionKey.equals(START_SEQUENCE_TAG) && !this.getElideSymbols()) { + StringBuffer seq = new StringBuffer(); + for (int i = 0 ; i < section.size(); i++) seq.append(((String[])section.get(i))[1]); + try { + SymbolList sl = new SimpleSymbolList(symParser, + seq.toString().replaceAll("\\s+","").replaceAll("[\\.|~]","-")); + rlistener.addSymbols(symParser.getAlphabet(), + (Symbol[])(sl.toList().toArray(new Symbol[0])), + 0, sl.length()); + } catch (Exception e) { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad sequence",sectionToString(section)); + System.err.println("error happens: " + message); + throw new ParseException(e, message); + } + } + } while (!sectionKey.equals(END_SEQUENCE_TAG)); + + // Allows us to tolerate trailing whitespace without + // thinking that there is another Sequence to follow + while (true) { + reader.mark(1); + int c = reader.read(); + if (c == -1) { + hasAnotherSequence = false; + break; + } + 
if (Character.isWhitespace((char) c)) { + //hasInternalWhitespace = true; + continue; + } + //if (hasInternalWhitespace) + // System.err.println("Warning: whitespace found between sequence entries"); + reader.reset(); + break; + } + + // Finish up. + rlistener.endSequence(); + return hasAnotherSequence; + } + + // reads an indented section, combining split lines and creating a list of key->value tuples + private List readSection(BufferedReader br) throws ParseException { + List section = new ArrayList(); + String line; + boolean done = false; + + // while not done + try { + while (!done) { + // mark buffer + br.mark(320); + // read token + line = br.readLine(); + if (line.length()<2) { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad line found",line); + System.err.println("error happens: " + message); + throw new ParseException(message); + } + String token = line.substring(0,2); + // READ SEQUENCE SECTION + if (token.equals(START_SEQUENCE_TAG)) { + // from next line, read sequence until // - leave // on stack + StringBuffer sb = new StringBuffer(); + while (!done) { + br.mark(160); + line = br.readLine(); + if (line.startsWith(END_SEQUENCE_TAG)) { + br.reset(); + done = true; + } else { + // create sequence tag->value pair to return, sans numbers + sb.append(line.replaceAll("\\d","")); + } + } + section.add(new String[]{START_SEQUENCE_TAG,sb.toString()}); + } + // READ FEATURE TABLE SECTION + else if (token.equals(FEATURE_HEADER_TAG)) { + // create dummy feature tag->value pair and add to return set + section.add(new String[]{FEATURE_TAG,null}); + // drop next FH line + line = br.readLine(); // skip next line too - it is also FH + // read all FT lines until XX + String currentTag = null; + StringBuffer currentVal = null; + while (!done) { + line = br.readLine(); + if (line.startsWith(DELIMITER_TAG)) { + done = true; + // dump current tag if exists + if (currentTag!=null) section.add(new 
String[]{currentTag,currentVal.toString()}); + } else { + // FT lines: FT word value + // or FT /word + // or FT /db_xref="taxon:3899.... + // ......" + line = line.substring(5); // chomp off "FT " + if (!line.startsWith(" ")) { + // dump current tag if exists + if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); + // case 1 : word value - splits into key-value on its own + String[] parts = line.trim().split("\\s+"); + currentTag = parts[0]; + currentVal = new StringBuffer(); + currentVal.append(parts[1]); + } else { + line = line.trim(); + if (line.startsWith("/")) { + // dump current tag if exists + if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); + // case 2 : /word[=.....] + currentVal = new StringBuffer(); + int equalIndex = line.indexOf('='); + if (equalIndex>=0) { + currentTag = line.substring(0, equalIndex); + currentVal.append(line.substring(equalIndex+1)); + } else { + currentTag = line; + } + } else { + // case 3 : ...." 
+ currentVal.append("\n"); + currentVal.append(line); + } + } + } + } + } + // READ END OF SEQUENCE + else if (token.equals(END_SEQUENCE_TAG)) { + section.add(new String[]{END_SEQUENCE_TAG,null}); + done = true; + } + // READ DELIMITER TAG + else if (token.equals(DELIMITER_TAG)) { + section.add(new String[]{DELIMITER_TAG,null}); + done = true; + } + // READ THIRD PARTY ANNOTATION SECTION + else if (token.equals(TPA_TAG)) { + // exception = don't know how to do TPA yet + // TODO: 2016/6/27 run into here with accession BK000583, HE580237 + /*String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle TPAs just yet",sectionToString(section)); + System.err.println("error happens: " + message); + throw new ParseException(message);*/ + section.add(new String[]{TPA_TAG, null}); + done = true; + } + // READ CONTIG SECTION + //else if (token.equals(CONTIG_TAG)) { + // exception = don't know how to do contigs yet + //String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle contig assemblies just yet",sectionToString(section)); + //throw new ParseException(message); + //} + //2016.1.27 modified by Xiujuan for parsing file, file containing CONTIG_TAG + else if (token.equals(CONTIG_TAG)) { + section.add(new String[]{CONTIG_TAG,null}); + done = true; + } + // READ DOCREF + else if (token.equals(DATABASE_XREF_TAG)) { + section.add(new String[]{DATABASE_XREF_TAG,line.substring(5).trim()}); + done = true; + } + // READ DATE + else if (token.equals(DATE_TAG)) { + section.add(new String[]{DATE_TAG,line.substring(5).trim()}); + done = true; + } + // READ NORMAL TAG/VALUE SECTION + else { + // rewind buffer to mark + br.reset(); + // read token/values until XX + String currentTag = null; + StringBuffer currentVal = null; + while (!done) { + line = br.readLine(); + if (line.startsWith(DELIMITER_TAG)) { + done = true; + // dump current tag if exists + if (currentTag!=null) section.add(new 
String[]{currentTag,currentVal.toString()}); + } else { + try { + // merge neighbouring repeated tokens by concatting values + // return tag->value pairs + String tag = line.substring(0,2); + String value = line.substring(5); + if (currentTag==null || !tag.equals(currentTag)) { + // dump current tag if exists + if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); + // start new tag + currentTag = tag; + currentVal = new StringBuffer(); + currentVal.append(value); + } else { + currentVal.append("\n"); + currentVal.append(value); + } + } catch (Exception e) { + String message = ParseException.newMessage(this.getClass(), accession, "not set","",sectionToString(section)); + System.err.println("error happens: " + message); + throw new ParseException(e, message); + } + } + } + } + } + } catch (IOException e) { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle TPAs just yet",sectionToString(section)); + System.err.println("error happens: " + message); + throw new ParseException(message); + } + return section; + } + + /** + * {@inheritDoc} + */ + public void writeSequence(Sequence seq, PrintStream os) throws IOException { + if (this.getPrintStream()==null) this.setPrintStream(os); + this.writeSequence(seq, RichObjectFactory.getDefaultNamespace()); + } + + /** + * {@inheritDoc} + */ + public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException { + if (this.getPrintStream()==null) this.setPrintStream(os); + this.writeSequence(seq, format, RichObjectFactory.getDefaultNamespace()); + } + + /** + * {@inheritDoc} + * Namespace is ignored as EMBL has no concept of it. + */ + public void writeSequence(Sequence seq, Namespace ns) throws IOException { + this.writeSequence(seq, this.getDefaultFormat(), ns); + } + + /** + * As per {@link #writeSequence(Sequence, Namespace)}, except + * that it also takes a format parameter. 
This can be any of the formats + * defined as constants in this class. + * @param seq see {@link #writeSequence(Sequence, Namespace)} + * @param format the format to use. + * @param ns see {@link #writeSequence(Sequence, Namespace)} + * @throws IOException see {@link #writeSequence(Sequence, Namespace)} + */ + public void writeSequence(Sequence seq, String format, Namespace ns) throws IOException { + if (!format.equals(EMBL_FORMAT) && !format.equals(EMBL_PRE87_FORMAT)) + throw new IllegalArgumentException("Format "+format+" not recognised."); + + RichSequence rs; + try { + if (seq instanceof RichSequence) rs = (RichSequence)seq; + else rs = RichSequence.Tools.enrich(seq); + } catch (ChangeVetoException e) { + IOException e2 = new IOException("Unable to enrich sequence"); + e2.initCause(e); + throw e2; + } + + SymbolTokenization tok; + try { + tok = rs.getAlphabet().getTokenization("token"); + } catch (Exception e) { + throw new RuntimeException("Unable to get alphabet tokenizer",e); + } + + Set notes = rs.getNoteSet(); + String accession = rs.getAccession(); + StringBuffer accessions = new StringBuffer(); + accessions.append(accession); + accessions.append(";"); + String cdat = null; + String udat = null; + String crel = null; + String urel = null; + String urecv = null; + String organelle = null; + String versionLine = null; + String dataClass = "STD"; + boolean genomic = false; + String moltype = rs.getAlphabet().getName(); + for (Iterator i = notes.iterator(); i.hasNext(); ) { + Note n = i.next(); + if (n.getTerm().equals(Terms.getDateCreatedTerm())) cdat=n.getValue(); + else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue(); + else if (n.getTerm().equals(Terms.getRelCreatedTerm())) crel=n.getValue(); + else if (n.getTerm().equals(Terms.getRelUpdatedTerm())) urel=n.getValue(); + else if (n.getTerm().equals(Terms.getRelUpdatedRecordVersionTerm())) urecv=n.getValue(); + else if (n.getTerm().equals(Terms.getMolTypeTerm())) moltype=n.getValue(); 
+ else if (n.getTerm().equals(Terms.getVersionLineTerm())) versionLine=n.getValue(); + else if (n.getTerm().equals(Terms.getGenomicTerm())) genomic = true; + else if (n.getTerm().equals(Terms.getDataClassTerm())) dataClass = n.getValue(); + else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) { + accessions.append(" "); + accessions.append(n.getValue()); + accessions.append(";"); + } else if (n.getTerm().equals(Terms.getOrganelleTerm())) organelle=n.getValue(); + } + + StringBuffer locusLine = new StringBuffer(); + // Division cannot be null + String div = rs.getDivision(); + if(div==null || div.length()==0 || div.length()>3) + div = "UNC"; //Unclassified + + if (format.equals(EMBL_FORMAT)) { + // accession; SV version; circular/linear; moltype; dataclass; division; length BP. + locusLine.append(rs.getAccession()); + locusLine.append("; SV "); + locusLine.append(rs.getVersion()); + locusLine.append("; "); + locusLine.append(rs.getCircular()?"circular":"linear"); + locusLine.append("; "); + locusLine.append(moltype); + locusLine.append("; "); + locusLine.append(dataClass); + locusLine.append("; "); + locusLine.append(div); + locusLine.append("; "); + locusLine.append(rs.length()); + locusLine.append(" BP."); + } else if (format.equals(EMBL_PRE87_FORMAT)) { + // entryname dataclass; [circular] molecule; division; sequencelength BP. 
+ locusLine.append(StringTools.rightPad(rs.getName(), 9)); + locusLine.append(" standard; "); + locusLine.append(rs.getCircular()?"circular ":""); + // if it is Ensembl genomic, add that in too + if (genomic==true) locusLine.append("genomic "); + locusLine.append(moltype); + locusLine.append("; "); + locusLine.append(div); + locusLine.append("; "); + locusLine.append(rs.length()); + locusLine.append(" BP."); + } + StringTools.writeKeyValueLine(LOCUS_TAG, locusLine.toString(), 5, this.getLineWidth(), null, LOCUS_TAG, this.getPrintStream()); + this.getPrintStream().println(DELIMITER_TAG+" "); + + // accession line + StringTools.writeKeyValueLine(ACCESSION_TAG, accessions.toString(), 5, this.getLineWidth(), null, ACCESSION_TAG, this.getPrintStream()); + this.getPrintStream().println(DELIMITER_TAG+" "); + + // version line + if (format.equals(EMBL_PRE87_FORMAT)) { + if (versionLine!=null) StringTools.writeKeyValueLine(VERSION_TAG, versionLine, 5, this.getLineWidth(), null, VERSION_TAG, this.getPrintStream()); + else StringTools.writeKeyValueLine(VERSION_TAG, accession+"."+rs.getVersion(), 5, this.getLineWidth(), null, VERSION_TAG, this.getPrintStream()); + this.getPrintStream().println(DELIMITER_TAG+" "); + } + + // date line + StringTools.writeKeyValueLine(DATE_TAG, (cdat==null?udat:cdat)+" (Rel. "+(crel==null?"0":crel)+", Created)", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream()); + StringTools.writeKeyValueLine(DATE_TAG, udat+" (Rel. 
"+(urel==null?"0":urel)+", Last updated, Version "+(urecv==null?"0":urecv)+")", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream()); + this.getPrintStream().println(DELIMITER_TAG+" "); + + // definition line + StringTools.writeKeyValueLine(DEFINITION_TAG, rs.getDescription(), 5, this.getLineWidth(), null, DEFINITION_TAG, this.getPrintStream()); + this.getPrintStream().println(DELIMITER_TAG+" "); + + // keywords line + StringBuffer keywords = new StringBuffer(); + for (Iterator n = notes.iterator(); n.hasNext(); ) { + Note nt = n.next(); + if (nt.getTerm().equals(Terms.getKeywordTerm())) { + if (keywords.length()>0) keywords.append("; "); + keywords.append(nt.getValue()); + } + } + if (keywords.length()>0) { + keywords.append("."); + StringTools.writeKeyValueLine(KEYWORDS_TAG, keywords.toString(), 5, this.getLineWidth(), null, KEYWORDS_TAG, this.getPrintStream()); + this.getPrintStream().println(DELIMITER_TAG+" "); + } else { + this.getPrintStream().println(KEYWORDS_TAG+" ."); + this.getPrintStream().println(DELIMITER_TAG+" "); + } + + // source line (from taxon) + // organism line + NCBITaxon tax = rs.getTaxon(); + if (tax!=null) { + StringTools.writeKeyValueLine(SOURCE_TAG, tax.getDisplayName(), 5, this.getLineWidth(), null, SOURCE_TAG, this.getPrintStream()); + StringTools.writeKeyValueLine(ORGANISM_TAG, tax.getNameHierarchy(), 5, this.getLineWidth(), null, SOURCE_TAG, this.getPrintStream()); + if (organelle!=null) StringTools.writeKeyValueLine(ORGANELLE_TAG, organelle, 5, this.getLineWidth(), null, ORGANELLE_TAG, this.getPrintStream()); + this.getPrintStream().println(DELIMITER_TAG+" "); + } + + // references - rank (bases x to y) + for (Iterator r = rs.getRankedDocRefs().iterator(); r.hasNext(); ) { + RankedDocRef rdr = r.next(); + DocRef d = rdr.getDocumentReference(); + // RN, RC, RP, RX, RG, RA, RT, RL + StringTools.writeKeyValueLine(REFERENCE_TAG, "["+rdr.getRank()+"]", 5, this.getLineWidth(), null, REFERENCE_TAG, this.getPrintStream()); + 
StringTools.writeKeyValueLine(REMARK_TAG, d.getRemark(), 5, this.getLineWidth(), null, REMARK_TAG, this.getPrintStream()); + Integer rstart = rdr.getStart(); + if (rstart==null) rstart = new Integer(1); + Integer rend = rdr.getEnd(); + if (rend==null) rend = new Integer(rs.length()); + StringTools.writeKeyValueLine(REFERENCE_POSITION_TAG, rstart+"-"+rend, 5, this.getLineWidth(), null, REFERENCE_POSITION_TAG, this.getPrintStream()); + CrossRef c = d.getCrossref(); + if (c!=null) StringTools.writeKeyValueLine(REFERENCE_XREF_TAG, c.getDbname()+"; "+c.getAccession()+".", 5, this.getLineWidth(), null, REFERENCE_XREF_TAG, this.getPrintStream()); + List auths = d.getAuthorList(); + for (Iterator j = auths.iterator(); j.hasNext(); ) { + DocRefAuthor a = j.next(); + if (a.isConsortium()) { + StringTools.writeKeyValueLine(CONSORTIUM_TAG, a+";", 5, this.getLineWidth(), null, CONSORTIUM_TAG, this.getPrintStream()); + j.remove(); + } + } + if (!auths.isEmpty()) StringTools.writeKeyValueLine(AUTHORS_TAG, DocRefAuthor.Tools.generateAuthorString(auths, true)+";", 5, this.getLineWidth(), null, AUTHORS_TAG, this.getPrintStream()); + else StringTools.writeKeyValueLine(AUTHORS_TAG, ";", 5, this.getLineWidth(), null, AUTHORS_TAG, this.getPrintStream()); + if (d.getTitle()!=null && d.getTitle().length()!=0) StringTools.writeKeyValueLine(TITLE_TAG, "\""+d.getTitle()+"\";", 5, this.getLineWidth(), null, TITLE_TAG, this.getPrintStream()); + else StringTools.writeKeyValueLine(TITLE_TAG, ";", 5, this.getLineWidth(), null, TITLE_TAG, this.getPrintStream()); + StringTools.writeKeyValueLine(LOCATOR_TAG, d.getLocation()+".", 5, this.getLineWidth(), null, LOCATOR_TAG, this.getPrintStream()); + this.getPrintStream().println(DELIMITER_TAG+" "); + } + + // db references - ranked + for (Iterator r = rs.getRankedCrossRefs().iterator(); r.hasNext(); ) { + RankedCrossRef rcr = r.next(); + CrossRef c = rcr.getCrossRef(); + Set noteset = c.getNoteSet(); + StringBuffer sb = new StringBuffer(); + 
sb.append(c.getDbname()); + sb.append("; "); + sb.append(c.getAccession()); + boolean hasSecondary = false; + for (Iterator i = noteset.iterator(); i.hasNext(); ) { + Note n = i.next(); + if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) { + sb.append("; "); + sb.append(n.getValue()); + hasSecondary = true; + } + } + //if (!hasSecondary) sb.append("; -"); + //sb.append("."); + if (!hasSecondary) sb.append(";"); + else sb.append("."); + StringTools.writeKeyValueLine(DATABASE_XREF_TAG, sb.toString(), 5, this.getLineWidth(), null, DATABASE_XREF_TAG, this.getPrintStream()); + } + if (!rs.getRankedCrossRefs().isEmpty()) + this.getPrintStream().println(DELIMITER_TAG+" "); + + // comments - if any + if (!rs.getComments().isEmpty()) { + StringBuffer sb = new StringBuffer(); + for (Iterator i = rs.getComments().iterator(); i.hasNext(); ) { + Comment c = i.next(); + sb.append(c.getComment()); + if (i.hasNext()) sb.append("\n"); + } + StringTools.writeKeyValueLine(COMMENT_TAG, sb.toString(), 5, this.getLineWidth(), null, COMMENT_TAG, this.getPrintStream()); + this.getPrintStream().println(DELIMITER_TAG+" "); + } + + this.getPrintStream().println(FEATURE_HEADER_TAG+" Key Location/Qualifiers"); + this.getPrintStream().println(FEATURE_HEADER_TAG+" "); + // feature_type location + for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) { + RichFeature f = (RichFeature)i.next(); + StringTools.writeKeyValueLine(FEATURE_TAG+" "+f.getTypeTerm().getName(), GenbankLocationParser.writeLocation((RichLocation)f.getLocation()), 21, this.getLineWidth(), ",", FEATURE_TAG, this.getPrintStream()); + for (Iterator j = f.getNoteSet().iterator(); j.hasNext(); ) { + Note n = j.next(); + // /key="val" or just /key if val=="" + if (n.getValue()==null || n.getValue().length()==0) StringTools.writeKeyValueLine(FEATURE_TAG, "/"+n.getTerm().getName(), 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream()); + else StringTools.writeKeyValueLine(FEATURE_TAG, 
"/"+n.getTerm().getName()+"=\""+n.getValue()+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream()); + } + // add-in to source feature only organism and db_xref="taxon:xyz" where present + if (f.getType().equals("source") && tax!=null) { + String displayName = tax.getDisplayName(); + if (displayName.indexOf('(')>-1) displayName = displayName.substring(0, displayName.indexOf('(')).trim(); + StringTools.writeKeyValueLine(FEATURE_TAG, "/organism=\""+displayName+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream()); + StringTools.writeKeyValueLine(FEATURE_TAG, "/db_xref=\"taxon:"+tax.getNCBITaxID()+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream()); + } + // add-in other dbxrefs where present + for (Iterator j = f.getRankedCrossRefs().iterator(); j.hasNext(); ) { + RankedCrossRef rcr = j.next(); + CrossRef cr = rcr.getCrossRef(); + StringTools.writeKeyValueLine(FEATURE_TAG, "/db_xref=\""+cr.getDbname()+":"+cr.getAccession()+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream()); + } + } + this.getPrintStream().println(DELIMITER_TAG+" "); + + // SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; + int aCount = 0; + int cCount = 0; + int gCount = 0; + int tCount = 0; + int oCount = 0; + for (int i = 1; i <= rs.length(); i++) { + char c; + try { + c = tok.tokenizeSymbol(rs.symbolAt(i)).charAt(0); + } catch (Exception e) { + throw new RuntimeException("Unable to get symbol at position "+i,e); + } + switch (c) { + case 'a': case 'A': + aCount++; + break; + case 'c': case 'C': + cCount++; + break; + case 'g': case 'G': + gCount++; + break; + case 't': case 'T': + tCount++; + break; + default: + oCount++; + } + } + this.getPrintStream().print(START_SEQUENCE_TAG+" Sequence "+rs.length()+" BP; "); + this.getPrintStream().print(aCount + " A; "); + this.getPrintStream().print(cCount + " C; "); + this.getPrintStream().print(gCount + " G; "); + this.getPrintStream().print(tCount + " T; "); + 
this.getPrintStream().println(oCount + " other;"); + + // sequence stuff + Symbol[] syms = (Symbol[])rs.toList().toArray(new Symbol[0]); + int lineLen = 0; + int symCount = 0; + this.getPrintStream().print(" "); + for (int i = 0; i < syms.length; i++) { + if (symCount % 60 == 0 && symCount>0) { + this.getPrintStream().print(StringTools.leftPad(""+symCount,10)); + this.getPrintStream().print("\n "); + lineLen = 0; + } + if (symCount % 10 == 0) { + this.getPrintStream().print(" "); + lineLen++; + } + try { + this.getPrintStream().print(tok.tokenizeSymbol(syms[i])); + } catch (IllegalSymbolException e) { + throw new RuntimeException("Found illegal symbol: "+syms[i]); + } + symCount++; + lineLen++; + } + this.getPrintStream().print(StringTools.leftPad(""+symCount,(66-lineLen)+10)); + this.getPrintStream().print("\n"); + this.getPrintStream().println(END_SEQUENCE_TAG); + } + + /** + * {@inheritDoc} + */ + public String getDefaultFormat() { + return EMBL_FORMAT; + } + + + /** + * Converts the current parse section to a String. Useful for debugging. 
+ */ + String sectionToString(List section){ + StringBuffer parseBlock = new StringBuffer(); + for(Iterator i = section.listIterator(); i.hasNext();){ + String[] part = (String[])i.next(); + for(int x = 0; x < part.length; x++){ + parseBlock.append(part[x]); + if(x == 0){ + parseBlock.append(" "); //the gap will have been trimmed + } + } + } + return parseBlock.toString(); + } +} diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomEnsemblFormat.java b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomEnsemblFormat.java new file mode 100644 index 0000000..c65eb63 --- /dev/null +++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomEnsemblFormat.java @@ -0,0 +1,1133 @@ +package cn.piflow.bundle.microorganism.util; + +import org.biojava.bio.seq.Sequence; +import org.biojava.bio.seq.io.ParseException; +import org.biojava.bio.seq.io.SeqIOListener; +import org.biojava.bio.seq.io.SymbolTokenization; +import org.biojava.bio.symbol.IllegalSymbolException; +import org.biojava.bio.symbol.Symbol; +import org.biojava.utils.ChangeVetoException; +import org.biojavax.*; +import org.biojavax.bio.seq.RichFeature; +import org.biojavax.bio.seq.RichLocation; +import org.biojavax.bio.seq.RichSequence; +import org.biojavax.bio.seq.io.GenbankLocationParser; +import org.biojavax.bio.seq.io.RichSeqIOListener; +import org.biojavax.bio.seq.io.RichSequenceFormat; +import org.biojavax.bio.taxa.NCBITaxon; +import org.biojavax.bio.taxa.SimpleNCBITaxon; +import org.biojavax.ontology.ComparableTerm; +import org.biojavax.utils.StringTools; + +import java.io.*; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Created by xiujuan on 2016/1/27. + */ +public class CustomEnsemblFormat extends RichSequenceFormat.HeaderlessFormat { + // Register this format with the format auto-guesser. 
+ static { + RichSequence.IOTools.registerFormat(CustomEMBLFormat.class); + } + + /** + * The name of the Pre-87 format + */ + public static final String EMBL_PRE87_FORMAT = "EMBL_PRE87"; + + /** + * The name of the current format + */ + public static final String EMBL_FORMAT = "EMBL"; + + protected static final String LOCUS_TAG = "ID"; + protected static final String ACCESSION_TAG = "AC"; + protected static final String VERSION_TAG = "SV"; + protected static final String DEFINITION_TAG = "DE"; + protected static final String DATE_TAG = "DT"; + protected static final String DATABASE_XREF_TAG = "DR"; + protected static final String SOURCE_TAG = "OS"; + protected static final String ORGANISM_TAG = "OC"; + protected static final String ORGANELLE_TAG = "OG"; + protected static final String REFERENCE_TAG = "RN"; + protected static final String REFERENCE_POSITION_TAG = "RP"; + protected static final String REFERENCE_XREF_TAG = "RX"; + protected static final String AUTHORS_TAG = "RA"; + protected static final String CONSORTIUM_TAG = "RG"; + protected static final String TITLE_TAG = "RT"; + protected static final String LOCATOR_TAG = "RL"; + protected static final String REMARK_TAG = "RC"; + protected static final String KEYWORDS_TAG = "KW"; + protected static final String COMMENT_TAG = "CC"; + protected static final String FEATURE_HEADER_TAG = "FH"; + protected static final String FEATURE_TAG = "FT"; + protected static final String CONTIG_TAG = "CO"; + protected static final String TPA_TAG = "AH"; + protected static final String START_SEQUENCE_TAG = "SQ"; + protected static final String DELIMITER_TAG = "XX"; + protected static final String END_SEQUENCE_TAG = "//"; + + // the date pattern Ensembl file + protected static final Pattern dp_ensembl = Pattern.compile("([^\\s]+)"); + // the date pattern + // date (Rel. N, Created) + // date (Rel. 
N, Last updated, Version M) + protected static final Pattern dp = Pattern.compile("([^\\s]+)\\s*(\\(Rel\\.\\s+(\\d+), ([^\\)\\d]+)(\\d*)\\))?$"); + // locus line + protected static final Pattern lp = Pattern.compile("^(\\S+);\\s+SV\\s+(\\d+);\\s+(linear|circular);\\s+(\\S+\\s?\\S+?);\\s+(\\S+);\\s+(\\S+);\\s+(\\d+)\\s+(BP|AA)\\.$"); + protected static final Pattern lpPre87 = Pattern.compile("^(\\S+)\\s+standard;\\s+(circular)?\\s*(genomic)?\\s*(\\S+);\\s+(\\S+);\\s+(\\d+)\\s+BP\\.$"); + //protected static final Pattern ensembl_id = Pattern.compile("^\\S+\\s+\\S+;\\s+\\S+;\\s+\\S+;\\s+(\\d+)\\s+BP\\.$"); + // version line + protected static final Pattern vp = Pattern.compile("^(\\S+?)\\.(\\d+)$"); + // reference position line + protected static final Pattern rpp = Pattern.compile("^(\\d+)(-(\\d+))?,?(\\s\\d+-\\d+,?)*$"); + // dbxref line + protected static final Pattern dbxp = Pattern.compile("^([^:]+):(.+)$"); + + protected static final Pattern readableFileNames = Pattern.compile(".*\\u002e(em|dat).*"); + protected static final Pattern headerLine = Pattern.compile("^ID.*"); + + private NCBITaxon tax = null; + private String organism = null; + private String accession = null; + + /** + * Implements some EMBL-specific terms. 
+ */ + public static class Terms extends RichSequence.Terms { + + /** + * Getter for the RelUpdatedRecordVersion term + * @return The RelUpdatedRecordVersion Term + */ + public static ComparableTerm getRelUpdatedRecordVersionTerm() { + return RichObjectFactory.getDefaultOntology().getOrCreateTerm("RelUpdatedRecordVersion"); + } + + /** + * Getter for the EMBL term + * @return The EMBL Term + */ + public static ComparableTerm getEMBLTerm() { + return RichObjectFactory.getDefaultOntology().getOrCreateTerm("EMBL"); + } + + /** + * Getter for the Ensembl-specific 'genomic' term + * @return The genomic Term + */ + public static ComparableTerm getGenomicTerm() { + return RichObjectFactory.getDefaultOntology().getOrCreateTerm("genomic"); + } + + /** + * Getter for the Ensembl-specific 'versionLine' term + * @return The version line Term + */ + public static ComparableTerm getVersionLineTerm() { + return RichObjectFactory.getDefaultOntology().getOrCreateTerm("versionLine"); + } + + /** + * Getter for the Ensembl-specific 'dataClass' term + * @return The data class Term + */ + public static ComparableTerm getDataClassTerm() { + return RichObjectFactory.getDefaultOntology().getOrCreateTerm("dataClass"); + } + + /** + * Getter for the Ensembl-specific 'organism' term + * @return The organism Term - "ORGANISM_TAG" + * added by xiujuan 2016-1-28 + */ + public static ComparableTerm getOrganismTerm(){ + return RichObjectFactory.getDefaultOntology().getOrCreateTerm("organism"); + } + + /** + * @return The length + * added by xiujuan 2016-1-28 + */ + public static ComparableTerm getLengthTerm(){ + return RichObjectFactory.getDefaultOntology().getOrCreateTerm("length"); + } + + /** + * for the ensembl file "DT" parse + * @return The Date + * added by xiujuan 2016-1-28 + */ + public static ComparableTerm getDateTerm(){ + return RichObjectFactory.getDefaultOntology().getOrCreateTerm("date"); + } + } + + /** + * {@inheritDoc} + * A file is in EMBL format if its name contains the word 
eem or edat, or the first line matches + * the EMBL format for the ID line. + */ + public boolean canRead(File file) throws IOException { + if (readableFileNames.matcher(file.getName()).matches()) return true; + BufferedReader br = new BufferedReader(new FileReader(file)); + String firstLine = br.readLine(); + boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() && + (lp.matcher(firstLine.substring(3).trim()).matches() || + lpPre87.matcher(firstLine.substring(3).trim()).matches() + ); + br.close(); + return readable; + } + + /** + * {@inheritDoc} + * Always returns a DNA tokenizer. + */ + public SymbolTokenization guessSymbolTokenization(File file) throws IOException { + return RichSequence.IOTools.getDNAParser(); + } + + /** + * {@inheritDoc} + * A stream is in EMBL format if its first line matches the EMBL format for the ID line. + */ + public boolean canRead(BufferedInputStream stream) throws IOException { + stream.mark(2000); // some streams may not support this + BufferedReader br = new BufferedReader(new InputStreamReader(stream)); + String firstLine = br.readLine(); + boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() && + (lp.matcher(firstLine.substring(3).trim()).matches() || + lpPre87.matcher(firstLine.substring(3).trim()).matches() + ); + // don't close the reader as it'll close the stream too. + // br.close(); + stream.reset(); + return readable; + } + + /** + * {@inheritDoc} + * Always returns a DNA tokenizer. 
+ */ + public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException { + return RichSequence.IOTools.getDNAParser(); + } + + /** + * {@inheritDoc} + */ + public boolean readSequence(BufferedReader reader, + SymbolTokenization symParser, + SeqIOListener listener) + throws IllegalSymbolException, IOException, ParseException { + if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today"); + return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null); + } + + /** + * {@inheritDoc} + */ + public boolean readRichSequence(BufferedReader reader, + SymbolTokenization symParser, + RichSeqIOListener rlistener, + Namespace ns) + throws IllegalSymbolException, IOException, ParseException { + tax = null; + organism = null; + accession = null; + boolean hasAnotherSequence = true; + //boolean hasInternalWhitespace = false; + + rlistener.startSequence(); + + if (ns==null) ns=RichObjectFactory.getDefaultNamespace(); + rlistener.setNamespace(ns); + + // Get an ordered list of key->value pairs in array-tuples + String sectionKey = null; + do { + List section = this.readSection(reader); + sectionKey = ((String[])section.get(0))[0]; + if(sectionKey == null){ + + String message = ParseException.newMessage(this.getClass(), accession, "No section key", "Not set", sectionToString(section)); + throw new ParseException(message); + } + // process section-by-section + if (sectionKey.equals(LOCUS_TAG)) { + // entryname dataclass; [circular] molecule; division; sequencelength BP. 
+ String loc = ((String[])section.get(0))[1]; + Matcher m = lp.matcher(loc); + Matcher mPre87 = lpPre87.matcher(loc); + if (m.matches()) { + // first token is both name and primary accession + rlistener.setName(m.group(1)); + rlistener.setAccession(m.group(1)); + // second token is version + rlistener.setVersion(Integer.parseInt(m.group(2))); + // third token is circular/linear + rlistener.setCircular(m.group(3).equals("circular")); + // fourth token is moltype + rlistener.addSequenceProperty(Terms.getMolTypeTerm(),m.group(4)); + // fifth token is data class + rlistener.addSequenceProperty(Terms.getDataClassTerm(),m.group(5)); + // sixth token is taxonomic division + rlistener.setDivision(m.group(6)); + // seventh token is sequence length, which is ignored + rlistener.addSequenceProperty(Terms.getLengthTerm(),m.group(7)); + // as it is calculated from the sequence data later. + } else if (mPre87.matches()) { + rlistener.setName(mPre87.group(1)); + if (mPre87.group(3)!=null) { + // add annotation for 'genomic' (Ensembl-specific term) + rlistener.addSequenceProperty(Terms.getGenomicTerm(),null); + } + rlistener.addSequenceProperty(Terms.getMolTypeTerm(),mPre87.group(4)); + rlistener.setDivision(mPre87.group(5)); + rlistener.addSequenceProperty(Terms.getLengthTerm(), mPre87.group(6)); + // Optional extras + String circular = mPre87.group(2); + if (circular!=null) rlistener.setCircular(true); + } else { + String message = ParseException.newMessage(this.getClass(),accession,"Not Set","Bad ID line found", sectionToString(section)); + throw new ParseException(message); + } + } else if (sectionKey.equals(DEFINITION_TAG)) { + rlistener.setDescription(((String[])section.get(0))[1]); + } else if (sectionKey.equals(SOURCE_TAG)) { + // only interested in organelle sub-tag + for (int i = 1; i < section.size(); i++) { + sectionKey = ((String[])section.get(i))[0]; + if (sectionKey.equals(ORGANELLE_TAG)) { + rlistener.addSequenceProperty(Terms.getOrganelleTerm(), 
((String[])section.get(i))[1].trim()); + break; // skip out of for loop once found + } + if(sectionKey.equals(ORGANISM_TAG)){ + rlistener.addSequenceProperty(Terms.getOrganismTerm(), ((String[])section.get(i))[1].trim()); + break; + } + } + } else if (sectionKey.equals(DATE_TAG)) { + String chunk = ((String[])section.get(0))[1].trim(); + Matcher dm = dp.matcher(chunk); + Matcher dm_ensembl = dp_ensembl.matcher(chunk); + if(dm_ensembl.matches()){ + String date = dm_ensembl.group(1); + rlistener.addSequenceProperty(Terms.getDateTerm(),date); + }else if (dm.matches()) { + String date = dm.group(1); + String rel = dm.group(3); + String type = dm.group(4); + if (type.equals("Created")) { + rlistener.addSequenceProperty(Terms.getDateCreatedTerm(), date); + rlistener.addSequenceProperty(Terms.getRelCreatedTerm(), rel); + } else if (type.equals("Last updated, Version ")) { + rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(), date); + rlistener.addSequenceProperty(Terms.getRelUpdatedTerm(), rel); + rlistener.addSequenceProperty(Terms.getRelUpdatedRecordVersionTerm(), dm.group(5)); + } else { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad date type found",sectionToString(section)); + throw new ParseException(message); + } + } else { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad date line found",sectionToString(section)); + throw new ParseException(message); + + } + } else if (sectionKey.equals(ACCESSION_TAG)) { + // if multiple accessions, store only first as accession, + // and store rest in annotation + String[] accs = ((String[])section.get(0))[1].split(";"); + accession = accs[0].trim(); + rlistener.setAccession(accession); + for (int i = 1; i < accs.length; i++) { + rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accs[i].trim()); + } + } else if (sectionKey.equals(VERSION_TAG)) { + String ver = ((String[])section.get(0))[1]; + /*Matcher m = vp.matcher(ver); + if 
(m.matches()) { + String verAcc = m.group(1); + if (!accession.equals(verAcc)) { + // the version refers to a different accession! + // believe the version line, and store the original + // accession away in the additional accession set + rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accession); + accession = verAcc; + rlistener.setAccession(accession); + } + rlistener.setVersion(Integer.parseInt(m.group(2))); + } else {*/ + rlistener.addSequenceProperty(Terms.getVersionLineTerm(),ver); + //} + } else if (sectionKey.equals(KEYWORDS_TAG)) { + String val = ((String[])section.get(0))[1]; + val = val.substring(0,val.length()-1); // chomp dot + val = val.replace('\n',' '); //remove newline + String[] kws = val.split(";"); + for (int i = 0; i < kws.length; i++) { + String kw = kws[i].trim(); + if (kw.length()==0) continue; + rlistener.addSequenceProperty(Terms.getKeywordTerm(), kw); + } + } else if (sectionKey.equals(DATABASE_XREF_TAG)) { + String val = ((String[])section.get(0))[1]; + val = val.substring(0,val.length()-1); // chomp dot + // database_identifier; primary_identifier; secondary_identifier.... 
+ String[] parts = val.split(";"); + // construct a DBXREF out of the dbname part[0] and accession part[1] + CrossRef crossRef = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{parts[0].trim(),parts[1].trim(), new Integer(0)}); + // assign remaining bits of info as annotations + for (int j = 2; j < parts.length; j++) { + Note note = new SimpleNote(Terms.getAdditionalAccessionTerm(),parts[j].trim(),j-1); + try { + crossRef.getRichAnnotation().addNote(note); + } catch (ChangeVetoException ce) { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "Could not annotate identifier terms",sectionToString(section)); + ParseException pe = new ParseException(message); + pe.initCause(ce); + throw pe; + } + } + RankedCrossRef rcrossRef = new SimpleRankedCrossRef(crossRef, 0); + rlistener.setRankedCrossRef(rcrossRef); + } else if (sectionKey.equals(REFERENCE_TAG) && !this.getElideReferences()) { + // first line of section has rank and location + String refrank = ((String[])section.get(0))[1]; + int ref_rank = Integer.parseInt(refrank.substring(1,refrank.length()-1)); + int ref_start = -999; + int ref_end = -999; + // rest can be in any order + String consortium = null; + String authors = ""; + String title = null; + String locator = null; + String pubmed = null; + String medline = null; + String doi = null; + String remark = null; + for (int i = 1; i < section.size(); i++) { + String key = ((String[])section.get(i))[0]; + String val = ((String[])section.get(i))[1]; + if (key.equals(AUTHORS_TAG)) { + if (val.endsWith(";")) val = val.substring(0,val.length()-1); // chomp semicolon + authors = val.replace('\n',' '); //see #2276 + } + if (key.equals(CONSORTIUM_TAG)) { + if (val.endsWith(";")) val = val.substring(0,val.length()-1); // chomp semicolon + consortium = val.replace('\n',' '); //see #2276 + } + if (key.equals(TITLE_TAG)) { + if (val.length()>1) { + if (val.endsWith(";")) val = val.substring(0,val.length()-1); // chomp 
semicolon + if (val.endsWith("\"")) val = val.substring(1,val.length()-1); // chomp quotes + title = val.replace('\n',' '); //see #2276 + } else title=null; // single semi-colon indicates no title + } + if (key.equals(LOCATOR_TAG)) { + if (val.endsWith(".")) val = val.substring(0,val.length()-1); // chomp dot + locator = val.replace('\n',' '); //see #2276 + } + if (key.equals(REFERENCE_XREF_TAG)) { + // database_identifier; primary_identifier. + String[] refs = val.split("\\.(\\s+|$)"); + for (int j = 0 ; j < refs.length; j++) { + if (refs[j].trim().length()==0) continue; + String[] parts = refs[j].split(";"); + String db = parts[0]; + String ref = parts[1].trim(); + if (db.equalsIgnoreCase(Terms.PUBMED_KEY)) pubmed = ref; + else if (db.equalsIgnoreCase(Terms.MEDLINE_KEY)) medline = ref; + else if (db.equalsIgnoreCase(Terms.DOI_KEY)) doi = ref; + } + } + if (key.equals(REMARK_TAG)) remark = val.replace('\n',' '); //see #2276 + if (key.equals(REFERENCE_POSITION_TAG)) { + // only the first group is taken + // if we have multiple lines, only the last line is taken + Matcher m = rpp.matcher(val); + if (m.matches()) { + ref_start = Integer.parseInt(m.group(1)); + if(m.group(2) != null) + ref_end = Integer.parseInt(m.group(3)); + } else { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad reference line found",sectionToString(section)); + throw new ParseException(message); + } + } + } + // create the docref object + try { + List authSet = DocRefAuthor.Tools.parseAuthorString(authors); + if (consortium!=null) authSet.add(new SimpleDocRefAuthor(consortium, true, false)); + DocRef dr = (DocRef)RichObjectFactory.getObject(SimpleDocRef.class,new Object[]{authSet,locator,title}); + // assign either the pubmed or medline to the docref - medline gets priority, then pubmed, then doi + if (medline!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.MEDLINE_KEY, medline, new Integer(0)})); + else if 
(pubmed!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.PUBMED_KEY, pubmed, new Integer(0)})); + else if (doi!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.DOI_KEY, doi, new Integer(0)})); + // assign the remarks + if (!this.getElideComments()) dr.setRemark(remark); + // assign the docref to the bioentry + RankedDocRef rdr = new SimpleRankedDocRef(dr, + (ref_start != -999 ? new Integer(ref_start) : null), + (ref_end != -999 ? new Integer(ref_end) : null), + ref_rank); + rlistener.setRankedDocRef(rdr); + rlistener.setRankedDocRef(rdr); + } catch (ChangeVetoException e) { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section)); + throw new ParseException(e, message); + } + } else if (sectionKey.equals(COMMENT_TAG) && !this.getElideComments()) { + // Set up some comments + rlistener.setComment(((String[])section.get(0))[1]); + } else if (sectionKey.equals(FEATURE_TAG) && !this.getElideFeatures()) { + // starting from second line of input, start a new feature whenever we come across + // a key that does not start with / + boolean seenAFeature = false; + int rcrossrefCount = 0; + for (int i = 1 ; i < section.size(); i++) { + String key = ((String[])section.get(i))[0]; + String val = ((String[])section.get(i))[1]; + if (key.startsWith("/")) { + key = key.substring(1); // strip leading slash + val = val.replaceAll("\\s*[\\n\\r]+\\s*","").trim(); + if (val.startsWith("\"")) val = val.substring(1,val.length()-1).trim(); // strip quotes + // parameter on old feature + if (key.equalsIgnoreCase("db_xref")) { + Matcher m = dbxp.matcher(val); + if (m.matches()) { + String dbname = m.group(1); + String raccession = m.group(2); + if (dbname.equalsIgnoreCase("taxon")) { + // Set the Taxon instead of a dbxref + tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{Integer.valueOf(raccession)}); + 
rlistener.setTaxon(tax); + try { + if (organism!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism); + } catch (ChangeVetoException e) { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section)); + throw new ParseException(e, message); + } + } else { + try { + CrossRef cr = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{dbname, raccession, new Integer(0)}); + RankedCrossRef rcr = new SimpleRankedCrossRef(cr, ++rcrossrefCount); + rlistener.getCurrentFeature().addRankedCrossRef(rcr); + } catch (ChangeVetoException e) { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section)); + throw new ParseException(e, message); + } + } + } else { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad dbxref found",sectionToString(section)); + throw new ParseException(message); + } + } else if (key.equalsIgnoreCase("organism")) { + try { + organism = val; + if (tax!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism); + } catch (ChangeVetoException e) { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section)); + throw new ParseException(message); + } + } else { + if (key.equalsIgnoreCase("translation")) { + // strip spaces from sequence + val = val.replaceAll("\\s+",""); + } + rlistener.addFeatureProperty(RichObjectFactory.getDefaultOntology().getOrCreateTerm(key),val); + } + } else { + // new feature! 
+ // end previous feature + if (seenAFeature) rlistener.endFeature(); + // start next one, with lots of lovely info in it + RichFeature.Template templ = new RichFeature.Template(); + templ.annotation = new SimpleRichAnnotation(); + templ.sourceTerm = Terms.getEMBLTerm(); + templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(key); + templ.featureRelationshipSet = new TreeSet(); + templ.rankedCrossRefs = new TreeSet(); + String tidyLocStr = val.replaceAll("\\s+",""); + templ.location = GenbankLocationParser.parseLocation(ns, accession, tidyLocStr); + rlistener.startFeature(templ); + seenAFeature = true; + rcrossrefCount = 0; + } + } + if (seenAFeature) rlistener.endFeature(); + } /*else if (sectionKey.equals(START_SEQUENCE_TAG) && !this.getElideSymbols()) { + StringBuffer seq = new StringBuffer(); + for (int i = 0 ; i < section.size(); i++) seq.append(((String[])section.get(i))[1]); + try { + SymbolList sl = new SimpleSymbolList(symParser, + seq.toString().replaceAll("\\s+","").replaceAll("[\\.|~]","-")); + rlistener.addSymbols(symParser.getAlphabet(), + (Symbol[])(sl.toList().toArray(new Symbol[0])), + 0, sl.length()); + } catch (Exception e) { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad sequence",sectionToString(section)); + throw new ParseException(e, message); + } + }*/ + } while (!sectionKey.equals(END_SEQUENCE_TAG)); + + // Allows us to tolerate trailing whitespace without + // thinking that there is another Sequence to follow + while (true) { + reader.mark(1); + int c = reader.read(); + if (c == -1) { + hasAnotherSequence = false; + break; + } + if (Character.isWhitespace((char) c)) { + //hasInternalWhitespace = true; + continue; + } + //if (hasInternalWhitespace) + // System.err.println("Warning: whitespace found between sequence entries"); + reader.reset(); + break; + } + + // Finish up. 
+ rlistener.endSequence(); + return hasAnotherSequence; + } + + // reads an indented section, combining split lines and creating a list of key->value tuples + private List readSection(BufferedReader br) throws ParseException { + List section = new ArrayList(); + String line; + boolean done = false; + + // while not done + try { + while (!done) { + // mark buffer + br.mark(160); + // read token + line = br.readLine(); + if (line.length()<2) { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad line found",line); + throw new ParseException(message); + } + String token = line.substring(0,2); + // READ SEQUENCE SECTION + if (token.equals(START_SEQUENCE_TAG)) { + // from next line, read sequence until // - leave // on stack + StringBuffer sb = new StringBuffer(); + while (!done) { + br.mark(160); + line = br.readLine(); + if (line.startsWith(END_SEQUENCE_TAG)) { + br.reset(); + done = true; + } else { + // create sequence tag->value pair to return, sans numbers + sb.append(line.replaceAll("\\d","")); + } + } + section.add(new String[]{START_SEQUENCE_TAG,sb.toString()}); + } + // READ FEATURE TABLE SECTION + else if (token.equals(FEATURE_HEADER_TAG)) { + // create dummy feature tag->value pair and add to return set + section.add(new String[]{FEATURE_TAG,null}); + // drop next FH line + line = br.readLine(); // skip next line too - it is also FH + // read all FT lines until XX + String currentTag = null; + StringBuffer currentVal = null; + while (!done) { + line = br.readLine(); + if (line.startsWith(DELIMITER_TAG)) { + done = true; + // dump current tag if exists + if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); + } else { + // FT lines: FT word value + // or FT /word + // or FT /db_xref="taxon:3899.... + // ......" 
+ line = line.substring(5); // chomp off "FT " + if (!line.startsWith(" ")) { + // dump current tag if exists + if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); + // case 1 : word value - splits into key-value on its own + String[] parts = line.trim().split("\\s+"); + currentTag = parts[0]; + currentVal = new StringBuffer(); + currentVal.append(parts[1]); + } else { + line = line.trim(); + if (line.startsWith("/")) { + // dump current tag if exists + if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); + // case 2 : /word[=.....] + currentVal = new StringBuffer(); + int equalIndex = line.indexOf('='); + if (equalIndex>=0) { + currentTag = line.substring(0, equalIndex); + currentVal.append(line.substring(equalIndex+1)); + } else { + currentTag = line; + } + } else { + // case 3 : ...." + currentVal.append("\n"); + currentVal.append(line); + } + } + } + } + } + // READ END OF SEQUENCE + else if (token.equals(END_SEQUENCE_TAG)) { + section.add(new String[]{END_SEQUENCE_TAG,null}); + done = true; + } + // READ DELIMITER TAG + else if (token.equals(DELIMITER_TAG)) { + section.add(new String[]{DELIMITER_TAG,null}); + done = true; + } + // READ THIRD PARTY ANNOTATION SECTION + else if (token.equals(TPA_TAG)) { + // exception = don't know how to do TPA yet + String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle TPAs just yet",sectionToString(section)); + throw new ParseException(message); + } + // READ CONTIG SECTION + //else if (token.equals(CONTIG_TAG)) { + // exception = don't know how to do contigs yet + //String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle contig assemblies just yet",sectionToString(section)); + //throw new ParseException(message); + //} + //2016.1.27 modified by Xiujuan for parsing file, file containing CONTIG_TAG + else if (token.equals(CONTIG_TAG)) { + section.add(new String[]{CONTIG_TAG,null}); + 
done = true; + } + // READ DOCREF + else if (token.equals(DATABASE_XREF_TAG)) { + section.add(new String[]{DATABASE_XREF_TAG,line.substring(5).trim()}); + done = true; + } + // READ DATE + else if (token.equals(DATE_TAG)) { + section.add(new String[]{DATE_TAG,line.substring(5).trim()}); + done = true; + } + // READ NORMAL TAG/VALUE SECTION + else { + // rewind buffer to mark + br.reset(); + // read token/values until XX + String currentTag = null; + StringBuffer currentVal = null; + while (!done) { + line = br.readLine(); + if (line.startsWith(DELIMITER_TAG)) { + done = true; + // dump current tag if exists + if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); + } else { + try { + // merge neighbouring repeated tokens by concatting values + // return tag->value pairs + String tag = line.substring(0,2); + String value = line.substring(5); + if (currentTag==null || !tag.equals(currentTag)) { + // dump current tag if exists + if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); + // start new tag + currentTag = tag; + currentVal = new StringBuffer(); + currentVal.append(value); + } else { + currentVal.append("\n"); + currentVal.append(value); + } + } catch (Exception e) { + String message = ParseException.newMessage(this.getClass(), accession, "not set","",sectionToString(section)); + throw new ParseException(e, message); + } + } + } + } + } + } catch (IOException e) { + String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle TPAs just yet",sectionToString(section)); + throw new ParseException(message); + } + return section; + } + + /** + * {@inheritDoc} + */ + public void writeSequence(Sequence seq, PrintStream os) throws IOException { + if (this.getPrintStream()==null) this.setPrintStream(os); + this.writeSequence(seq, RichObjectFactory.getDefaultNamespace()); + } + + /** + * {@inheritDoc} + */ + public void writeSequence(Sequence seq, String format, PrintStream 
os) throws IOException { + if (this.getPrintStream()==null) this.setPrintStream(os); + this.writeSequence(seq, format, RichObjectFactory.getDefaultNamespace()); + } + + /** + * {@inheritDoc} + * Namespace is ignored as EMBL has no concept of it. + */ + public void writeSequence(Sequence seq, Namespace ns) throws IOException { + this.writeSequence(seq, this.getDefaultFormat(), ns); + } + + /** + * As per {@link #writeSequence(Sequence, Namespace)}, except + * that it also takes a format parameter. This can be any of the formats + * defined as constants in this class. + * @param seq see {@link #writeSequence(Sequence, Namespace)} + * @param format the format to use. + * @param ns see {@link #writeSequence(Sequence, Namespace)} + * @throws IOException see {@link #writeSequence(Sequence, Namespace)} + */ + public void writeSequence(Sequence seq, String format, Namespace ns) throws IOException { + if (!format.equals(EMBL_FORMAT) && !format.equals(EMBL_PRE87_FORMAT)) + throw new IllegalArgumentException("Format "+format+" not recognised."); + + RichSequence rs; + try { + if (seq instanceof RichSequence) rs = (RichSequence)seq; + else rs = RichSequence.Tools.enrich(seq); + } catch (ChangeVetoException e) { + IOException e2 = new IOException("Unable to enrich sequence"); + e2.initCause(e); + throw e2; + } + + SymbolTokenization tok; + try { + tok = rs.getAlphabet().getTokenization("token"); + } catch (Exception e) { + throw new RuntimeException("Unable to get alphabet tokenizer",e); + } + + Set notes = rs.getNoteSet(); + String accession = rs.getAccession(); + StringBuffer accessions = new StringBuffer(); + accessions.append(accession); + accessions.append(";"); + String cdat = null; + String udat = null; + String crel = null; + String urel = null; + String urecv = null; + String organelle = null; + String versionLine = null; + String dataClass = "STD"; + boolean genomic = false; + String moltype = rs.getAlphabet().getName(); + for (Iterator i = notes.iterator(); 
i.hasNext(); ) { + Note n = i.next(); + if (n.getTerm().equals(Terms.getDateCreatedTerm())) cdat=n.getValue(); + else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue(); + else if (n.getTerm().equals(Terms.getRelCreatedTerm())) crel=n.getValue(); + else if (n.getTerm().equals(Terms.getRelUpdatedTerm())) urel=n.getValue(); + else if (n.getTerm().equals(Terms.getRelUpdatedRecordVersionTerm())) urecv=n.getValue(); + else if (n.getTerm().equals(Terms.getMolTypeTerm())) moltype=n.getValue(); + else if (n.getTerm().equals(Terms.getVersionLineTerm())) versionLine=n.getValue(); + else if (n.getTerm().equals(Terms.getGenomicTerm())) genomic = true; + else if (n.getTerm().equals(Terms.getDataClassTerm())) dataClass = n.getValue(); + else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) { + accessions.append(" "); + accessions.append(n.getValue()); + accessions.append(";"); + } else if (n.getTerm().equals(Terms.getOrganelleTerm())) organelle=n.getValue(); + } + + StringBuffer locusLine = new StringBuffer(); + // Division cannot be null + String div = rs.getDivision(); + if(div==null || div.length()==0 || div.length()>3) + div = "UNC"; //Unclassified + + if (format.equals(EMBL_FORMAT)) { + // accession; SV version; circular/linear; moltype; dataclass; division; length BP. + locusLine.append(rs.getAccession()); + locusLine.append("; SV "); + locusLine.append(rs.getVersion()); + locusLine.append("; "); + locusLine.append(rs.getCircular()?"circular":"linear"); + locusLine.append("; "); + locusLine.append(moltype); + locusLine.append("; "); + locusLine.append(dataClass); + locusLine.append("; "); + locusLine.append(div); + locusLine.append("; "); + locusLine.append(rs.length()); + locusLine.append(" BP."); + } else if (format.equals(EMBL_PRE87_FORMAT)) { + // entryname dataclass; [circular] molecule; division; sequencelength BP. 
+ locusLine.append(StringTools.rightPad(rs.getName(), 9)); + locusLine.append(" standard; "); + locusLine.append(rs.getCircular()?"circular ":""); + // if it is Ensembl genomic, add that in too + if (genomic==true) locusLine.append("genomic "); + locusLine.append(moltype); + locusLine.append("; "); + locusLine.append(div); + locusLine.append("; "); + locusLine.append(rs.length()); + locusLine.append(" BP."); + } + StringTools.writeKeyValueLine(LOCUS_TAG, locusLine.toString(), 5, this.getLineWidth(), null, LOCUS_TAG, this.getPrintStream()); + this.getPrintStream().println(DELIMITER_TAG+" "); + + // accession line + StringTools.writeKeyValueLine(ACCESSION_TAG, accessions.toString(), 5, this.getLineWidth(), null, ACCESSION_TAG, this.getPrintStream()); + this.getPrintStream().println(DELIMITER_TAG+" "); + + // version line + if (format.equals(EMBL_PRE87_FORMAT)) { + if (versionLine!=null) StringTools.writeKeyValueLine(VERSION_TAG, versionLine, 5, this.getLineWidth(), null, VERSION_TAG, this.getPrintStream()); + else StringTools.writeKeyValueLine(VERSION_TAG, accession+"."+rs.getVersion(), 5, this.getLineWidth(), null, VERSION_TAG, this.getPrintStream()); + this.getPrintStream().println(DELIMITER_TAG+" "); + } + + // date line + StringTools.writeKeyValueLine(DATE_TAG, (cdat==null?udat:cdat)+" (Rel. "+(crel==null?"0":crel)+", Created)", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream()); + StringTools.writeKeyValueLine(DATE_TAG, udat+" (Rel. 
"+(urel==null?"0":urel)+", Last updated, Version "+(urecv==null?"0":urecv)+")", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream()); + this.getPrintStream().println(DELIMITER_TAG+" "); + + // definition line + StringTools.writeKeyValueLine(DEFINITION_TAG, rs.getDescription(), 5, this.getLineWidth(), null, DEFINITION_TAG, this.getPrintStream()); + this.getPrintStream().println(DELIMITER_TAG+" "); + + // keywords line + StringBuffer keywords = new StringBuffer(); + for (Iterator n = notes.iterator(); n.hasNext(); ) { + Note nt = n.next(); + if (nt.getTerm().equals(Terms.getKeywordTerm())) { + if (keywords.length()>0) keywords.append("; "); + keywords.append(nt.getValue()); + } + } + if (keywords.length()>0) { + keywords.append("."); + StringTools.writeKeyValueLine(KEYWORDS_TAG, keywords.toString(), 5, this.getLineWidth(), null, KEYWORDS_TAG, this.getPrintStream()); + this.getPrintStream().println(DELIMITER_TAG+" "); + } else { + this.getPrintStream().println(KEYWORDS_TAG+" ."); + this.getPrintStream().println(DELIMITER_TAG+" "); + } + + // source line (from taxon) + // organism line + NCBITaxon tax = rs.getTaxon(); + if (tax!=null) { + StringTools.writeKeyValueLine(SOURCE_TAG, tax.getDisplayName(), 5, this.getLineWidth(), null, SOURCE_TAG, this.getPrintStream()); + StringTools.writeKeyValueLine(ORGANISM_TAG, tax.getNameHierarchy(), 5, this.getLineWidth(), null, SOURCE_TAG, this.getPrintStream()); + if (organelle!=null) StringTools.writeKeyValueLine(ORGANELLE_TAG, organelle, 5, this.getLineWidth(), null, ORGANELLE_TAG, this.getPrintStream()); + this.getPrintStream().println(DELIMITER_TAG+" "); + } + + // references - rank (bases x to y) + for (Iterator r = rs.getRankedDocRefs().iterator(); r.hasNext(); ) { + RankedDocRef rdr = r.next(); + DocRef d = rdr.getDocumentReference(); + // RN, RC, RP, RX, RG, RA, RT, RL + StringTools.writeKeyValueLine(REFERENCE_TAG, "["+rdr.getRank()+"]", 5, this.getLineWidth(), null, REFERENCE_TAG, this.getPrintStream()); + 
StringTools.writeKeyValueLine(REMARK_TAG, d.getRemark(), 5, this.getLineWidth(), null, REMARK_TAG, this.getPrintStream()); + Integer rstart = rdr.getStart(); + if (rstart==null) rstart = new Integer(1); + Integer rend = rdr.getEnd(); + if (rend==null) rend = new Integer(rs.length()); + StringTools.writeKeyValueLine(REFERENCE_POSITION_TAG, rstart+"-"+rend, 5, this.getLineWidth(), null, REFERENCE_POSITION_TAG, this.getPrintStream()); + CrossRef c = d.getCrossref(); + if (c!=null) StringTools.writeKeyValueLine(REFERENCE_XREF_TAG, c.getDbname()+"; "+c.getAccession()+".", 5, this.getLineWidth(), null, REFERENCE_XREF_TAG, this.getPrintStream()); + List auths = d.getAuthorList(); + for (Iterator j = auths.iterator(); j.hasNext(); ) { + DocRefAuthor a = j.next(); + if (a.isConsortium()) { + StringTools.writeKeyValueLine(CONSORTIUM_TAG, a+";", 5, this.getLineWidth(), null, CONSORTIUM_TAG, this.getPrintStream()); + j.remove(); + } + } + if (!auths.isEmpty()) StringTools.writeKeyValueLine(AUTHORS_TAG, DocRefAuthor.Tools.generateAuthorString(auths, true)+";", 5, this.getLineWidth(), null, AUTHORS_TAG, this.getPrintStream()); + else StringTools.writeKeyValueLine(AUTHORS_TAG, ";", 5, this.getLineWidth(), null, AUTHORS_TAG, this.getPrintStream()); + if (d.getTitle()!=null && d.getTitle().length()!=0) StringTools.writeKeyValueLine(TITLE_TAG, "\""+d.getTitle()+"\";", 5, this.getLineWidth(), null, TITLE_TAG, this.getPrintStream()); + else StringTools.writeKeyValueLine(TITLE_TAG, ";", 5, this.getLineWidth(), null, TITLE_TAG, this.getPrintStream()); + StringTools.writeKeyValueLine(LOCATOR_TAG, d.getLocation()+".", 5, this.getLineWidth(), null, LOCATOR_TAG, this.getPrintStream()); + this.getPrintStream().println(DELIMITER_TAG+" "); + } + + // db references - ranked + for (Iterator r = rs.getRankedCrossRefs().iterator(); r.hasNext(); ) { + RankedCrossRef rcr = r.next(); + CrossRef c = rcr.getCrossRef(); + Set noteset = c.getNoteSet(); + StringBuffer sb = new StringBuffer(); + 
sb.append(c.getDbname()); + sb.append("; "); + sb.append(c.getAccession()); + boolean hasSecondary = false; + for (Iterator i = noteset.iterator(); i.hasNext(); ) { + Note n = i.next(); + if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) { + sb.append("; "); + sb.append(n.getValue()); + hasSecondary = true; + } + } + //if (!hasSecondary) sb.append("; -"); + //sb.append("."); + if (!hasSecondary) sb.append(";"); + else sb.append("."); + StringTools.writeKeyValueLine(DATABASE_XREF_TAG, sb.toString(), 5, this.getLineWidth(), null, DATABASE_XREF_TAG, this.getPrintStream()); + } + if (!rs.getRankedCrossRefs().isEmpty()) + this.getPrintStream().println(DELIMITER_TAG+" "); + + // comments - if any + if (!rs.getComments().isEmpty()) { + StringBuffer sb = new StringBuffer(); + for (Iterator i = rs.getComments().iterator(); i.hasNext(); ) { + Comment c = i.next(); + sb.append(c.getComment()); + if (i.hasNext()) sb.append("\n"); + } + StringTools.writeKeyValueLine(COMMENT_TAG, sb.toString(), 5, this.getLineWidth(), null, COMMENT_TAG, this.getPrintStream()); + this.getPrintStream().println(DELIMITER_TAG+" "); + } + + this.getPrintStream().println(FEATURE_HEADER_TAG+" Key Location/Qualifiers"); + this.getPrintStream().println(FEATURE_HEADER_TAG+" "); + // feature_type location + for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) { + RichFeature f = (RichFeature)i.next(); + StringTools.writeKeyValueLine(FEATURE_TAG+" "+f.getTypeTerm().getName(), GenbankLocationParser.writeLocation((RichLocation)f.getLocation()), 21, this.getLineWidth(), ",", FEATURE_TAG, this.getPrintStream()); + for (Iterator j = f.getNoteSet().iterator(); j.hasNext(); ) { + Note n = j.next(); + // /key="val" or just /key if val=="" + if (n.getValue()==null || n.getValue().length()==0) StringTools.writeKeyValueLine(FEATURE_TAG, "/"+n.getTerm().getName(), 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream()); + else StringTools.writeKeyValueLine(FEATURE_TAG, 
"/"+n.getTerm().getName()+"=\""+n.getValue()+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream()); + } + // add-in to source feature only organism and db_xref="taxon:xyz" where present + if (f.getType().equals("source") && tax!=null) { + String displayName = tax.getDisplayName(); + if (displayName.indexOf('(')>-1) displayName = displayName.substring(0, displayName.indexOf('(')).trim(); + StringTools.writeKeyValueLine(FEATURE_TAG, "/organism=\""+displayName+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream()); + StringTools.writeKeyValueLine(FEATURE_TAG, "/db_xref=\"taxon:"+tax.getNCBITaxID()+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream()); + } + // add-in other dbxrefs where present + for (Iterator j = f.getRankedCrossRefs().iterator(); j.hasNext(); ) { + RankedCrossRef rcr = j.next(); + CrossRef cr = rcr.getCrossRef(); + StringTools.writeKeyValueLine(FEATURE_TAG, "/db_xref=\""+cr.getDbname()+":"+cr.getAccession()+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream()); + } + } + this.getPrintStream().println(DELIMITER_TAG+" "); + + // SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; + int aCount = 0; + int cCount = 0; + int gCount = 0; + int tCount = 0; + int oCount = 0; + for (int i = 1; i <= rs.length(); i++) { + char c; + try { + c = tok.tokenizeSymbol(rs.symbolAt(i)).charAt(0); + } catch (Exception e) { + throw new RuntimeException("Unable to get symbol at position "+i,e); + } + switch (c) { + case 'a': case 'A': + aCount++; + break; + case 'c': case 'C': + cCount++; + break; + case 'g': case 'G': + gCount++; + break; + case 't': case 'T': + tCount++; + break; + default: + oCount++; + } + } + this.getPrintStream().print(START_SEQUENCE_TAG+" Sequence "+rs.length()+" BP; "); + this.getPrintStream().print(aCount + " A; "); + this.getPrintStream().print(cCount + " C; "); + this.getPrintStream().print(gCount + " G; "); + this.getPrintStream().print(tCount + " T; "); + 
this.getPrintStream().println(oCount + " other;"); + + // sequence stuff + Symbol[] syms = (Symbol[])rs.toList().toArray(new Symbol[0]); + int lineLen = 0; + int symCount = 0; + this.getPrintStream().print(" "); + for (int i = 0; i < syms.length; i++) { + if (symCount % 60 == 0 && symCount>0) { + this.getPrintStream().print(StringTools.leftPad(""+symCount,10)); + this.getPrintStream().print("\n "); + lineLen = 0; + } + if (symCount % 10 == 0) { + this.getPrintStream().print(" "); + lineLen++; + } + try { + this.getPrintStream().print(tok.tokenizeSymbol(syms[i])); + } catch (IllegalSymbolException e) { + throw new RuntimeException("Found illegal symbol: "+syms[i]); + } + symCount++; + lineLen++; + } + this.getPrintStream().print(StringTools.leftPad(""+symCount,(66-lineLen)+10)); + this.getPrintStream().print("\n"); + this.getPrintStream().println(END_SEQUENCE_TAG); + } + + /** + * {@inheritDoc} + */ + public String getDefaultFormat() { + return EMBL_FORMAT; + } + + + /** + * Converts the current parse section to a String. Useful for debugging. 
+ */ + String sectionToString(List section){ + StringBuffer parseBlock = new StringBuffer(); + for(Iterator i = section.listIterator(); i.hasNext();){ + String[] part = (String[])i.next(); + for(int x = 0; x < part.length; x++){ + parseBlock.append(part[x]); + if(x == 0){ + parseBlock.append(" "); //the gap will have been trimmed + } + } + } + return parseBlock.toString(); + } +} diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomIOTools.java b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomIOTools.java index afed93a..0e1c65b 100644 --- a/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomIOTools.java +++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomIOTools.java @@ -1,6 +1,5 @@ package cn.piflow.bundle.microorganism.util; - import org.biojava.bio.BioError; import org.biojava.bio.BioException; import org.biojava.bio.seq.*; @@ -662,10 +661,19 @@ public interface CustomIOTools { * @return a RichSequenceIterator over each sequence in the * fasta file */ + public static RichSequenceIterator readEMBLDNA(BufferedReader br, + Namespace ns) { + return new RichStreamReader(br, new CustomEMBLFormat(), getDNAParser(), + factory, ns); + } - - + //parse Ensembl file + public static RichSequenceIterator readEnsembl(BufferedReader br, + Namespace ns) { + return new RichStreamReader(br, new CustomEnsemblFormat(), getDNAParser(), + factory, ns); + } /** * Iterate over the sequences in an EMBL-format stream of RNA sequences. @@ -753,7 +761,11 @@ public interface CustomIOTools { * @return a RichSequenceIterator over each sequence in the * fasta file */ - + public static RichSequenceIterator readUniProt(BufferedReader br, + Namespace ns) { + return new RichStreamReader(br, new CustomUniProtFormat(), + getProteinParser(), factory, ns); + } /** * Read a UniProt XML file using a custom type of SymbolList. 
For diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomUniProtFormat.java b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomUniProtFormat.java new file mode 100644 index 0000000..5478a5e --- /dev/null +++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomUniProtFormat.java @@ -0,0 +1,1291 @@ +package cn.piflow.bundle.microorganism.util; + +import org.biojava.bio.proteomics.MassCalc; +import org.biojava.bio.seq.Sequence; +import org.biojava.bio.seq.io.ParseException; +import org.biojava.bio.seq.io.SeqIOListener; +import org.biojava.bio.seq.io.SymbolTokenization; +import org.biojava.bio.symbol.*; +import org.biojava.ontology.Term; +import org.biojava.utils.ChangeVetoException; +import org.biojavax.*; +import org.biojavax.bio.seq.RichFeature; +import org.biojavax.bio.seq.RichLocation; +import org.biojavax.bio.seq.RichSequence; +import org.biojavax.bio.seq.io.RichSeqIOListener; +import org.biojavax.bio.seq.io.RichSequenceFormat; +import org.biojavax.bio.seq.io.UniProtCommentParser; +import org.biojavax.bio.seq.io.UniProtLocationParser; +import org.biojavax.bio.taxa.NCBITaxon; +import org.biojavax.bio.taxa.SimpleNCBITaxon; +import org.biojavax.ontology.ComparableTerm; +import org.biojavax.utils.CRC64Checksum; +import org.biojavax.utils.StringTools; + +import java.io.*; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Created by xiujuan on 2016/5/11. + */ +public class CustomUniProtFormat extends RichSequenceFormat.HeaderlessFormat{ + + + // Register this format with the format auto-guesser. 
+ static { + RichSequence.IOTools.registerFormat(CustomUniProtFormat.class); + } + + /** + * The name of this format + */ + public static final String UNIPROT_FORMAT = "UniProt"; + + private static final String SUBFORMAT_UNIPROT = "UniProt"; + private static final String SUBFORMAT_IPI = "IPI"; + + protected static final String LOCUS_TAG = "ID"; + protected static final String ACCESSION_TAG = "AC"; + protected static final String DEFINITION_TAG = "DE"; + protected static final String DATE_TAG = "DT"; + protected static final String SOURCE_TAG = "OS"; + protected static final String ORGANELLE_TAG = "OG"; + protected static final String ORGANISM_TAG = "OC"; + protected static final String TAXON_TAG = "OX"; + protected static final String GENE_TAG = "GN"; + protected static final String DATABASE_XREF_TAG = "DR"; + protected static final String PROTEIN_EXIST_TAG = "PE"; + protected static final String REFERENCE_TAG = "RN"; + protected static final String RP_LINE_TAG = "RP"; + protected static final String REFERENCE_XREF_TAG = "RX"; + protected static final String AUTHORS_TAG = "RA"; + protected static final String CONSORTIUM_TAG = "RG"; + protected static final String TITLE_TAG = "RT"; + protected static final String LOCATION_TAG = "RL"; + protected static final String RC_LINE_TAG = "RC"; + protected static final String KEYWORDS_TAG = "KW"; + protected static final String COMMENT_TAG = "CC"; + protected static final String FEATURE_TAG = "FT"; + protected static final String START_SEQUENCE_TAG = "SQ"; + protected static final String END_SEQUENCE_TAG = "//"; + protected static final String ORGANISM_HOST_TAG = "OH"; + + // locus line for uniprot format + protected static final Pattern lp_uniprot = Pattern.compile("^((\\S+)_(\\S+))\\s+(\\S+);\\s+(PRT)?;?\\s*\\d+\\s+AA\\.$"); + // locus line for IPI format + protected static final Pattern lp_ipi = Pattern.compile("^((\\S+)\\.(\\d+))\\s+(IPI);\\s+(PRT)?;?\\s*\\d+\\s+AA\\.$"); + // RP line parser + protected static final 
Pattern rppat = Pattern.compile("SEQUENCE OF (\\d+)-(\\d+)"); + // date lineDT for uniprot + // date, integrated into UniProtKB/database_name. + // date, sequence version x. + // date, entry version x. + protected static final Pattern dp_uniprot = Pattern.compile("([^,]+),([^\\d\\.]+)(\\d+)?\\.$"); + // date lineDT for IPI + // date (xxx, Created) + // date (xxx, Last sequence update) + protected static final Pattern dp_ipi = Pattern.compile("([^\\(]+)\\(([^,]+),([^\\)]+)\\)$"); + // feature line + protected static final Pattern fp = Pattern.compile("^\\s*([\\d?<]+\\s+[\\d?>]+)(\\s+(.*))?$"); + + protected static final Pattern headerLine = Pattern.compile("^ID.*"); + + /** + * Implements some UniProt-specific terms. + */ + public static class Terms extends RichSequence.Terms { + private static String GENENAME_KEY = "Name"; + private static String GENESYNONYM_KEY = "Synonyms"; + private static String ORDLOCNAME_KEY = "OrderedLocusNames"; + private static String ORFNAME_KEY = "ORFNames"; + + /** + * Getter for the UniProt term + * @return The UniProt Term + */ + public static ComparableTerm getUniProtTerm() { + return RichObjectFactory.getDefaultOntology().getOrCreateTerm("UniProt"); + } + + /** + * Getter for the UniProt combined database term + * @return The combined database for UniProt Term + */ + public static ComparableTerm getUniProtDBNameTerm() { + return RichObjectFactory.getDefaultOntology().getOrCreateTerm("UniProt database name"); + } + + /** + * Getter for the protein exists term + * @return The protein exists Term + */ + public static ComparableTerm getProteinExistsTerm() { + return RichObjectFactory.getDefaultOntology().getOrCreateTerm("UniProt protein exists"); + } + + public static ComparableTerm getOrganismHostTerm(){ + return RichObjectFactory.getDefaultOntology().getOrCreateTerm("Organism host"); + } + + public static ComparableTerm getSequenceMetaInfoTerm(){ + return RichObjectFactory.getDefaultOntology().getOrCreateTerm("Sequence meta info"); + 
} + } + + /** + * {@inheritDoc} + * A file is in UniProt format if the first line matches the UniProt format for the ID line. + */ + public boolean canRead(File file) throws IOException { + BufferedReader br = new BufferedReader(new FileReader(file)); + String firstLine = br.readLine(); + boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() && + (lp_uniprot.matcher(firstLine.substring(3).trim()).matches() || + lp_ipi.matcher(firstLine.substring(3).trim()).matches()); + br.close(); + return readable; + } + + /** + * {@inheritDoc} + * Always returns a protein tokenizer. + */ + public SymbolTokenization guessSymbolTokenization(File file) throws IOException { + return RichSequence.IOTools.getProteinParser(); + } + + /** + * {@inheritDoc} + * A stream is in UniProt format if the first line matches the UniProt format for the ID line. + */ + public boolean canRead(BufferedInputStream stream) throws IOException { + stream.mark(2000); // some streams may not support this + BufferedReader br = new BufferedReader(new InputStreamReader(stream)); + String firstLine = br.readLine(); + boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() && + (lp_uniprot.matcher(firstLine.substring(3).trim()).matches() + || lp_ipi.matcher(firstLine.substring(3).trim()).matches()); + // don't close the reader as it'll close the stream too. + // br.close(); + stream.reset(); + return readable; + } + + /** + * {@inheritDoc} + * Always returns a protein tokenizer. 
+ */ + public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException { + return RichSequence.IOTools.getProteinParser(); + } + + /** + * {@inheritDoc} + */ + public boolean readSequence(BufferedReader reader, + SymbolTokenization symParser, + SeqIOListener listener) + throws IllegalSymbolException, IOException, ParseException { + if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today"); + return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null); + } + + private String accession = null; + + /** + * {@inheritDoc} + */ + public boolean readRichSequence(BufferedReader reader, + SymbolTokenization symParser, + RichSeqIOListener rlistener, + Namespace ns) + throws IllegalSymbolException, IOException, ParseException { + + boolean hasAnotherSequence = true; + //boolean hasInternalWhitespace = false; + + String subformat = SUBFORMAT_UNIPROT; + + rlistener.startSequence(); + + if (ns==null) ns=RichObjectFactory.getDefaultNamespace(); + rlistener.setNamespace(ns); + + // Get an ordered list of key->value pairs in array-tuples + String sectionKey = null; + NCBITaxon tax = null; + accession = null; + List section = null; + try{ + do { + + section = this.readSection(reader); + sectionKey = ((String[])section.get(0))[0]; + if(sectionKey == null){ + String message = ParseException.newMessage(this.getClass(),accession, "", "Section key was null", sectionToString(section)); + throw new ParseException(message); + } + // process section-by-section + if (sectionKey.equals(LOCUS_TAG)) { + // entryname dataclass; moltype; sequencelength AA. 
+ String loc = ((String[])section.get(0))[1]; + Matcher m = lp_uniprot.matcher(loc); + if (m.matches()) { + rlistener.setName(m.group(2)); + rlistener.setDivision(m.group(3)); + if (m.groupCount() > 4){ + rlistener.addSequenceProperty(Terms.getDataClassTerm(),m.group(4)); + rlistener.addSequenceProperty(Terms.getMolTypeTerm(),m.group(5)); + }else{ + rlistener.addSequenceProperty(Terms.getDataClassTerm(), m.group(4)); + rlistener.addSequenceProperty(Terms.getMolTypeTerm(), ""); + } + } else { + m = lp_ipi.matcher(loc); + if (m.matches()) { + subformat = SUBFORMAT_IPI; + rlistener.setName(m.group(2)); + rlistener.setVersion(Integer.parseInt(m.group(3))); + rlistener.addSequenceProperty(Terms.getDataClassTerm(), m.group(4)); + rlistener.addSequenceProperty(Terms.getMolTypeTerm(),m.group(5)); + } else { + String message = ParseException.newMessage(this.getClass(),accession, "", "Bad ID line", sectionToString(section)); + throw new ParseException(message); + } + } + } else if (sectionKey.equals(DEFINITION_TAG)) { + String val = ((String[])section.get(0))[1]; + if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot + rlistener.setDescription(val); + } else if (sectionKey.equals(SOURCE_TAG)) { + // use SOURCE_TAG and TAXON_TAG values + String sciname = null; + String comname = null; + List synonym = new ArrayList(); + List lineage = new ArrayList(); + int taxid = 0; + for (int i = 0; i < section.size(); i++) { + String tag = ((String[])section.get(i))[0]; + String value = ((String[])section.get(i))[1].trim(); + value = value.replace("\n", " "); + value = value.replace("\r\n", " "); + + if (tag.equals(SOURCE_TAG)) { + if (value.endsWith(".")) value = value.substring(0,value.length()-1); // chomp trailing dot + String[] parts = value.split("\\("); + sciname = parts[0].trim(); + if (parts.length>1) { + comname = parts[1].trim(); + if (comname.endsWith(")")) comname = comname.substring(0,comname.length()-1); // chomp trailing bracket + if (parts.length>2) 
{ + // synonyms + for (int j = 2 ; j < parts.length; j++) { + String syn = parts[j].trim(); + if (syn.endsWith(")")) syn = syn.substring(0,syn.length()-1); // chomp trailing bracket + synonym.add(syn); + } + } + } + } else if (tag.equals(TAXON_TAG)) { + String[] parts = value.split(";"); + for (int j = 0; j < parts.length; j++) { + String[] bits = parts[j].split("="); + if (bits[0].equals("NCBI_TaxID")) { + String[] morebits = bits[1].split(","); + taxid = Integer.parseInt(morebits[0].split(" ")[0].trim()); + } + } + } else if (tag.equals(ORGANELLE_TAG)) { + if (value.endsWith(".")) value = value.substring(0,value.length()-1); // chomp trailing dot + String[] parts = value.split(";"); + for (int j = 0; j < parts.length; j++) { + parts[j]=parts[j].trim(); + rlistener.addSequenceProperty(Terms.getOrganelleTerm(),parts[j]); + } + } + //added by xiujuan 2016.5.12 + else if(tag.equals(ORGANISM_TAG)){ + if (value.endsWith(".")) value = value.substring(0,value.length()-1); // chomp trailing dot + String[] parts = value.split(";"); + for (int j = 0; j < parts.length; j++) { + parts[j]=parts[j].trim(); + lineage.add(parts[j]); + } + }else if(tag.equals(ORGANISM_HOST_TAG)) { //"OH"tag Organism Host + String[] parts = value.split("\\. 
"); + for(int j = 0; j < parts.length; j++){ + rlistener.addSequenceProperty(Terms.getOrganismHostTerm(),parts[j]); + } + } + } + // Set the Taxon + tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{new Integer(taxid)}); + rlistener.setTaxon(tax); + try { + if (sciname!=null) tax.addName(NCBITaxon.SCIENTIFIC,sciname); + if (comname!=null) tax.addName(NCBITaxon.COMMON,comname); + for (Iterator j = synonym.iterator(); j.hasNext(); ) tax.addName(NCBITaxon.SYNONYM, (String)j.next()); + for(Iterator j = lineage.iterator();j.hasNext();)tax.addName("lineage",(String)j.next()); + } catch (ChangeVetoException e) { + throw new ParseException(e); + } + } else if (sectionKey.equals(DATE_TAG)) { + String chunk = ((String[])section.get(0))[1]; + if(subformat.equals(SUBFORMAT_UNIPROT)) { + Matcher dm = dp_uniprot.matcher(chunk); + if (dm.matches()) { + String date = dm.group(1).trim(); + String type = dm.group(2).trim(); + String rel = dm.group(3); + if (rel!=null) rel = rel.trim(); + if (type.startsWith("integrated into UniProtKB")) { + String dbname = type.split("/")[1]; + rlistener.addSequenceProperty(Terms.getDateCreatedTerm(), date); + rlistener.addSequenceProperty(Terms.getUniProtDBNameTerm(), dbname); + } else if (type.equalsIgnoreCase("sequence version")) { + if (rel==null){ + String message = ParseException.newMessage(this.getClass(),accession, "", "Version missing for "+type, sectionToString(section)); + throw new ParseException(message); + } + rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(), date); + rlistener.setVersion(Integer.parseInt(rel)); + } else if (type.equalsIgnoreCase("entry version")) { + if (rel==null) { + String message = ParseException.newMessage(this.getClass(),accession, "", "Version missing for "+type, sectionToString(section)); + throw new ParseException(message); + } + rlistener.addSequenceProperty(Terms.getDateAnnotatedTerm(), date); + rlistener.addSequenceProperty(Terms.getRelAnnotatedTerm(), rel); + } else 
{ + String message = ParseException.newMessage(this.getClass(),accession, "", "Bad date type "+type, sectionToString(section)); + throw new ParseException(message); + } + } else { + String message = ParseException.newMessage(this.getClass(),accession, "", "Bad date line", sectionToString(section)); + throw new ParseException(message); + } + } else if(subformat.equals(SUBFORMAT_IPI)) { + Matcher dm = dp_ipi.matcher(chunk); + if (dm.matches()) { + String date = dm.group(1).trim(); + String type = dm.group(3).trim(); + if(type.equals("Created")) { + rlistener.addSequenceProperty(Terms.getDateCreatedTerm(), date); + } else if(type.equals("Last sequence update")) { + rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(), date); + } else { + String message = ParseException.newMessage(this.getClass(),accession, "", "Bad date type "+type, sectionToString(section)); + throw new ParseException(message); + } + } else { + String message = ParseException.newMessage(this.getClass(),accession, "", "Bad date line", sectionToString(section)); + throw new ParseException(message); + } + } else { + String message = ParseException.newMessage(this.getClass(),accession, "", "Unknown date line format", sectionToString(section)); + throw new ParseException(message); + } + } else if (sectionKey.equals(ACCESSION_TAG)) { + // if multiple accessions, store only first as accession, + // and store rest in annotation + String[] accs = ((String[])section.get(0))[1].split(";"); + if(accs.length>0) accession = accs[0].trim(); else accession = ""; + rlistener.setAccession(accession); + for (int i = 1; i < accs.length; i++) { + rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accs[i].trim()); + } + } else if (sectionKey.equals(PROTEIN_EXIST_TAG)) { + String val = ((String[])section.get(0))[1]; + if (val.endsWith(";")) val = val.substring(0, val.length()-1); // chomp semicolon + rlistener.addSequenceProperty(Terms.getProteinExistsTerm(),val.trim()); + } else if 
(sectionKey.equals(KEYWORDS_TAG)) { + String val = ((String[])section.get(0))[1]; + if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot + val = val.replace('\n',' '); //remove newline + String[] kws = val.split(";"); + for (int i = 0; i < kws.length; i++) { + String kw = kws[i].trim(); + if (kw.length()==0) continue; + rlistener.addSequenceProperty(Terms.getKeywordTerm(), kw); + } + } else if (sectionKey.equals(GENE_TAG)) { + String[] genes = ((String[])section.get(0))[1].split("\\s+(or|and)\\s+"); + for (int geneID = 0; geneID < genes.length; geneID++) { + String[] parts = genes[geneID].replace('\n', ' ').split(";"); + for (int j = 0; j < parts.length; j++) { + if(parts[j].matches(".+=.+")){ + String[] moreparts = parts[j].split("="); + String[] values = moreparts[1].split(","); + // nasty hack - we really should have notes on the gene object itself... if such a thing existed... + if (moreparts[0].trim().equals(Terms.GENENAME_KEY)) rlistener.addSequenceProperty(Terms.getGeneNameTerm(),geneID+":"+values[0].trim()); + else if (moreparts[0].trim().equals(Terms.GENESYNONYM_KEY)) { + for (int k = 0; k < values.length; k++) rlistener.addSequenceProperty(Terms.getGeneSynonymTerm(),geneID+":"+values[k].trim()); + } else if (moreparts[0].trim().equals(Terms.ORDLOCNAME_KEY)) { + for (int k = 0; k < values.length; k++) rlistener.addSequenceProperty(Terms.getOrderedLocusNameTerm(),geneID+":"+values[k].trim()); + } else if (moreparts[0].trim().equals(Terms.ORFNAME_KEY)) { + for (int k = 0; k < values.length; k++) rlistener.addSequenceProperty(Terms.getORFNameTerm(),geneID+":"+values[k].trim()); + } + } + } + } + } else if (sectionKey.equals(DATABASE_XREF_TAG)) { + // database_identifier; primary_identifier; secondary_identifier.... 
+ String val = ((String[])section.get(0))[1]; + if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot + String[] parts = val.split(";"); + // construct a DBXREF out of the dbname part[0] and accession part[1] + String dbname = parts[0].trim(); + String acc = parts[1].trim(); + CrossRef crossRef = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{dbname,acc,new Integer(0)}); + // assign remaining bits of info as additional accession annotations + for (int j = 2; j < parts.length; j++) { + ComparableTerm t = (ComparableTerm) Terms.getAdditionalAccessionTerm(); + Note note = new SimpleNote(t,parts[j].trim(),j-1); + try { + crossRef.getRichAnnotation().addNote(note); + } catch (ChangeVetoException ce) { + ParseException pe = new ParseException("Could not annotate additional accession terms"); + pe.initCause(ce); + throw pe; + } + } + RankedCrossRef rcrossRef = new SimpleRankedCrossRef(crossRef, 0); + rlistener.setRankedCrossRef(rcrossRef); + } else if (sectionKey.equals(REFERENCE_TAG) && !this.getElideReferences()) { + // first line of section has rank and location + String refrank = ((String[])section.get(0))[1]; + refrank = refrank.trim().split(" ")[0]; + int ref_rank = Integer.parseInt(refrank.substring(1,refrank.length()-1)); + // rest can be in any order + String authors = null; + String consortium = null; + String title = null; + String locator = null; + String pubmed = null; + String medline = null; + String doi = null; + String remark = null; + Integer rstart = null; + Integer rend = null; + for (int i = 1; i < section.size(); i++) { + String key = ((String[])section.get(i))[0]; + String val = ((String[])section.get(i))[1]; + //System.err.println(key+": "+val); + if (key.equals(AUTHORS_TAG)) { + if (val.endsWith(";")) val = val.substring(0, val.length()-1); // chomp semicolon + authors = val.replace('\n',' '); //see #2276 + } + if (key.equals(CONSORTIUM_TAG)) { + if (val.endsWith(";")) val = val.substring(0, 
val.length()-1); // chomp semicolon + consortium = val.replace('\n',' '); //see #2276 + } + if (key.equals(TITLE_TAG)) { + if (val.endsWith(";")) val = val.substring(0, val.length()-1); // chomp semicolon + if (val.endsWith("\"")) val = val.substring(1, val.length()-1); // chomp quotes + title = val.replace('\n',' '); //see #2276 + } + if (key.equals(LOCATION_TAG)) { + if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot + locator = val.replace('\n',' '); //see #2276 + } + if (key.equals(REFERENCE_XREF_TAG)) { + // database_identifier=primary_identifier; + String[] refs = val.split(";"); + for (int j = 0 ; j < refs.length; j++) { + if (refs[j].trim().length()==0) continue; + String[] parts = refs[j].split("="); + if ( parts.length <2) { + // some DOI lines look like this and are causing problems: + //DOI=10.1002/(SICI)1097-0215(19990702)82:1<137::AID-IJC23>3.0.CO;2-F;ignoring + System.err.println("warning: problems while parsing: " + val); + continue; + } + String db = parts[0].trim(); + String ref = parts[1].trim(); + if (db.equalsIgnoreCase(Terms.PUBMED_KEY)) pubmed = ref; + else if (db.equalsIgnoreCase(Terms.MEDLINE_KEY)) medline = ref; + else if (db.equalsIgnoreCase(Terms.DOI_KEY)) doi = ref; + } + } + if (key.equals(RP_LINE_TAG)) { + if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot + remark = val.replace('\n',' '); //see #2276 + // Try to use it to find the location of the reference, if we have one. + Matcher m = rppat.matcher(val); + if (m.matches()) { + rstart = Integer.valueOf(m.group(1)); + rend = Integer.valueOf(m.group(2)); + } + } + if (key.equals(RC_LINE_TAG)) { + // Split into key=value pairs separated by semicolons and terminated with semicolon. 
+ String[] parts = val.split(";"); + for (int j = 0; j < parts.length; j++) { + String[] subparts = parts[j].split("="); + // get term for first section + String termName = subparts[0].trim(); + Term t; + if (termName.equalsIgnoreCase(Terms.SPECIES_KEY)) t = Terms.getSpeciesTerm(); + else if (termName.equalsIgnoreCase(Terms.STRAIN_KEY)) t = Terms.getStrainTerm(); + else if (termName.equalsIgnoreCase(Terms.TISSUE_KEY)) t = Terms.getTissueTerm(); + else if (termName.equalsIgnoreCase(Terms.TRANSPOSON_KEY)) t = Terms.getTransposonTerm(); + else if (termName.equalsIgnoreCase(Terms.PLASMID_KEY)) t = Terms.getPlasmidTerm(); + else { + String message = ParseException.newMessage(this.getClass(),accession, "", "Invalid RC term found: "+termName, sectionToString(section)); + throw new ParseException(message); + } + // assign notes using term and rank:second section as value + // nasty hack - we really should have notes on the reference itself. + rlistener.addSequenceProperty("docref_" + t.toString(), ref_rank+":"+subparts[1].trim()); + } + } + } + + // create the docref object + try { + List auths = null; + if(authors != null) auths = DocRefAuthor.Tools.parseAuthorString(authors); + if (consortium!=null){ + if(auths == null) auths = new ArrayList(); + auths.add(new SimpleDocRefAuthor(consortium,true,false)); + } + DocRef dr = (DocRef)RichObjectFactory.getObject(SimpleDocRef.class,new Object[]{auths,locator,title}); + + //save all Crossref to the sequence property + if (medline!=null) rlistener.addSequenceProperty("docref_"+"medline", ref_rank+":"+medline); + if (pubmed!=null) rlistener.addSequenceProperty("docref_"+"pubmed", ref_rank+":"+pubmed); + if (doi!=null) rlistener.addSequenceProperty("docref_"+"doi", ref_rank+":"+doi); + // assign either the pubmed or medline to the docref - medline gets priority, then pubmed, then doi +// if (medline!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.MEDLINE_KEY, medline, new 
Integer(0)})); +// else if (pubmed!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.PUBMED_KEY, pubmed, new Integer(0)})); +// else if (doi!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.DOI_KEY, doi, new Integer(0)})); + // assign the remarks + if (!this.getElideComments()) dr.setRemark(remark); + // assign the docref to the bioentry + RankedDocRef rdr = new SimpleRankedDocRef(dr,rstart,rend,ref_rank); + rlistener.setRankedDocRef(rdr); + } catch (ChangeVetoException e) { + throw new ParseException(e); + } + } else if (sectionKey.equals(COMMENT_TAG) && !this.getElideComments()) { + // Set up some comments + String val = ((String[])section.get(0))[1]; + if (UniProtCommentParser.isParseable(val)) rlistener.setComment(val); + else { + // copyright message + rlistener.addSequenceProperty(Terms.getCopyrightTerm(), val); + } + } else if (sectionKey.equals(FEATURE_TAG) && !this.getElideFeatures()) { + // starting from second line of input, start a new feature whenever we come across + // a key that does not start with / + boolean seenAFeature = false; + for (int i = 1 ; i < section.size(); i++) { + String key = ((String[])section.get(i))[0]; + String val = ((String[])section.get(i))[1]; + val = val.replaceAll("\\s*[\\n\\r]+\\s*", " ").trim(); + if (val.endsWith(".")) val = val.substring(0,val.length()-1); // chomp dot + if (key.startsWith("/")) { + key = key.substring(1); // strip leading slash + if (key.equals("FTId")) rlistener.addFeatureProperty(Terms.getFTIdTerm(),val); + else { + // should never happen - but here just in case + rlistener.addFeatureProperty(RichObjectFactory.getDefaultOntology().getOrCreateTerm(key),val); + } + } else { + // new feature! 
+ // end previous feature + if (seenAFeature) rlistener.endFeature(); + // start next one, with lots of lovely info in it + RichFeature.Template templ = new RichFeature.Template(); + templ.annotation = new SimpleRichAnnotation(); + templ.sourceTerm = Terms.getUniProtTerm(); + templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(key); + templ.featureRelationshipSet = new TreeSet(); + templ.rankedCrossRefs = new TreeSet(); + String desc = null; + Matcher m = fp.matcher(val); + if (m.matches()) { + String loc = m.group(1); + desc = m.group(3); + templ.location = UniProtLocationParser.parseLocation(loc); + } else { + String message = ParseException.newMessage(this.getClass(),accession, "", "Bad feature value: "+val, sectionToString(section)); + throw new ParseException(message); + } + rlistener.startFeature(templ); + if (desc!=null && desc.length()>0) rlistener.addFeatureProperty(Terms.getFeatureDescTerm(),desc); + seenAFeature = true; + } + } + if (seenAFeature) rlistener.endFeature(); + } else if (sectionKey.equals(START_SEQUENCE_TAG) && !this.getElideSymbols()) { + StringBuffer seq = new StringBuffer(); + + for (int i = 0 ; i < section.size()-1; i++) seq.append(((String[])section.get(i))[1]); + String seqMetaInfo = ((String[])section.get(section.size()-1))[1]; + rlistener.addSequenceProperty(Terms.getSequenceMetaInfoTerm(), seqMetaInfo); + //section size greater than 1? 
+ try { + SymbolList sl = new SimpleSymbolList(symParser, + seq.toString().replaceAll("\\s+","").replaceAll("[\\.|~]","-")); + rlistener.addSymbols(symParser.getAlphabet(), + (Symbol[])(sl.toList().toArray(new Symbol[0])), + 0, sl.length()); + } catch (IllegalAlphabetException e) { + String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section)); + throw new ParseException(e, message); + } + } + } while (!sectionKey.equals(END_SEQUENCE_TAG)); + }catch (RuntimeException e){ + String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section)); + throw new ParseException(e, message); + } + + // Allows us to tolerate trailing whitespace without + // thinking that there is another Sequence to follow + while (true) { + reader.mark(1); + int c = reader.read(); + if (c == -1) { + hasAnotherSequence = false; + break; + } + if (Character.isWhitespace((char) c)) { + //hasInternalWhitespace = true; + continue; + } + //if (hasInternalWhitespace) + //System.err.println("Warning: whitespace found between sequence entries"); + reader.reset(); + break; + } + + // Finish up. 
+ rlistener.endSequence(); + return hasAnotherSequence; + } + + // reads an indented section, combining split lines and creating a list of key->value tuples + private List readSection(BufferedReader br) throws ParseException { + List section = new ArrayList(); + String line; + boolean done = false; + + // while not done + try { + while (!done) { + // mark buffer + br.mark(320); + // read token + line = br.readLine(); + if (line.length()<2) { + String message = ParseException.newMessage(this.getClass(),accession, "", "Bad line found: "+line, sectionToString(section)); + throw new ParseException(message); + } + String token = line.substring(0,2); + // READ SEQUENCE SECTION + if (token.equals(START_SEQUENCE_TAG)) { + // from next line, read sequence until // - leave // on stack + StringBuffer sb = new StringBuffer(); + String sequence_meta_info = line.substring(5); + while (!done) { + br.mark(160); + line = br.readLine(); + if (line.startsWith(END_SEQUENCE_TAG)) { + br.reset(); + done = true; + } else { + // create sequence tag->value pair to return, sans numbers + sb.append(line); + } + } + section.add(new String[]{START_SEQUENCE_TAG,sb.toString()}); + section.add(new String[]{"Sequence_Meta_Info", sequence_meta_info}); + } + // READ COMMENT SECTION + else if (token.equals(COMMENT_TAG)) { + // read from first line till next that begins with "CC -!-" + StringBuffer currentVal = new StringBuffer(); + boolean wasMisc = false; + if (!line.startsWith(COMMENT_TAG+" -!-")) wasMisc = true; + currentVal.append(line.substring(5)); + while (!done) { + br.mark(160); + line = br.readLine(); + if (((!wasMisc) && line.charAt(5)!=' ') || !line.startsWith("C") || line.startsWith(COMMENT_TAG+" -!-")) { + br.reset(); + done = true; + // dump current tag if exists + section.add(new String[]{COMMENT_TAG,currentVal.toString()}); + } else { + currentVal.append("\n"); + currentVal.append(line.substring(5)); + } + } + } + // READ FEATURE TABLE SECTION + else if (token.equals(FEATURE_TAG)) { 
+ br.reset(); + // read all FT lines until first non-FT starting line + String currentTag = null; + StringBuffer currentVal = new StringBuffer(); + section.add(new String[]{FEATURE_TAG,null}); + while (!done) { + br.mark(160); + line = br.readLine(); + if (!line.startsWith(FEATURE_TAG)) { + br.reset(); + done = true; + // dump current tag if exists + if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); + } else { + // FT lines: FT KEY_NAME x x description + // or: FT .... + // or FT /FTId=899. + line = line.substring(5); // chomp off "FT " + if (!line.startsWith(" ")) { + // dump current tag if exists + if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); + // case 1 : word value - splits into key-value based on first 8 chars + currentTag = line.substring(0,8).trim(); + currentVal = new StringBuffer(); + currentVal.append(line.substring(8).trim()); + } else { + line = line.trim(); + if (line.startsWith("/") && line.indexOf("=") != -1) { + // dump current tag if exists + if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); + // case 3 : /word=..... + currentVal = new StringBuffer(); + int equalIndex = line.indexOf('='); + if (equalIndex>=0) { + currentTag = line.substring(0, equalIndex); + currentVal.append(line.substring(equalIndex+1)); + } else { + currentTag = line; + } + } else { + // case 2 : ...." 
+ currentVal.append("\n"); + currentVal.append(line); + } + } + } + } + } + // READ DOCREF + else if (token.equals(DATABASE_XREF_TAG)) { + section.add(new String[]{DATABASE_XREF_TAG,line.substring(5).trim()}); + done = true; + } + // READ DATE + else if (token.equals(DATE_TAG)) { + section.add(new String[]{DATE_TAG,line.substring(5).trim()}); + done = true; + } + // READ END OF SEQUENCE + else if (token.equals(END_SEQUENCE_TAG)) { + section.add(new String[]{END_SEQUENCE_TAG,null}); + done = true; + } + // READ NORMAL TAG/VALUE SECTION + else { + // rewind buffer to mark + br.reset(); + // read token/values until first with non-same first character + // exceptions: DE/DT, and RN...RN + String currentTag = null; + char currentTagStart = '\0'; + StringBuffer currentVal = null; + while (!done) { + br.mark(320); + line = br.readLine(); + if (currentTagStart=='\0') currentTagStart = line.charAt(0); + if (!line.startsWith(""+currentTagStart) || + (currentTagStart=='D' && currentTag!=null && !line.startsWith(""+currentTag)) || + (currentTagStart=='R' && currentTag!=null && line.startsWith("RN"))) { + br.reset(); + done = true; + // dump current tag if exists + if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); + } else { + try { + // merge neighbouring repeated tokens by concatting values + // return tag->value pairs + String tag = line.substring(0,2); + String value = line.substring(5); + if (currentTag==null || !tag.equals(currentTag)) { + // dump current tag if exists + if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()}); + // start new tag + currentTag = tag; + currentVal = new StringBuffer(); + currentVal.append(value); + } else { + currentVal.append("\n"); + currentVal.append(value); + } + } catch (Exception e) { + String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section)); + throw new ParseException(e, message); + } + } + } + } + } + } catch (IOException e) { + 
            // Wrap low-level I/O / runtime failures with parse context (accession + the
            // partially-read section) so the caller can see where parsing stopped.
            String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section));
            throw new ParseException(e, message);
        } catch (RuntimeException e){
            String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section));
            throw new ParseException(e, message);
        }
        return section;
    }

    /**
     * {@inheritDoc}
     * Convenience overload: binds the stream (first call only) and delegates to the
     * namespace-based writer using the default namespace.
     */
    public void writeSequence(Sequence seq, PrintStream os) throws IOException {
        if (this.getPrintStream()==null) this.setPrintStream(os);
        this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
    }

    /**
     * {@inheritDoc}
     * As above, but rejects any format name other than this writer's default.
     * @throws IllegalArgumentException if {@code format} is not the default format.
     */
    public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException {
        if (this.getPrintStream()==null) this.setPrintStream(os);
        if (!format.equals(this.getDefaultFormat())) throw new IllegalArgumentException("Unknown format: "+format);
        this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
    }

    /**
     * {@inheritDoc}
     * Namespace is ignored as UniProt has no concept of it.
     */
    // NOTE(review): this block was transcribed from a whitespace-mangled patch; runs of
    // spaces inside string literals (column padding of the flat-file output) may have been
    // collapsed — verify padding against the UniProt flat-file specification.
    public void writeSequence(Sequence seq, Namespace ns) throws IOException {
        // Upgrade the plain Sequence to a RichSequence so notes, taxon and refs are reachable.
        RichSequence rs;
        try {
            if (seq instanceof RichSequence) rs = (RichSequence)seq;
            else rs = RichSequence.Tools.enrich(seq);
        } catch (ChangeVetoException e) {
            IOException e2 = new IOException("Unable to enrich sequence");
            e2.initCause(e);
            throw e2;
        }

        SymbolTokenization tok;
        try {
            tok = rs.getAlphabet().getTokenization("token");
        } catch (Exception e) {
            throw new RuntimeException("Unable to get alphabet tokenizer",e);
        }

        // First pass over the note set: pull out the header-level fields (dates, db name,
        // data class, copyright, accessions) and bucket the rank-tagged note values
        // (species/strain/tissue/transposon/plasmid and the gene-name families) by rank.
        Set notes = rs.getNoteSet();
        String accession = rs.getAccession();
        StringBuffer accessions = new StringBuffer();
        accessions.append(accession);
        accessions.append(";");
        String cdat = null;
        String udat = null;
        String adat = null;
        String dbname = "?";
        String arel = null;
        String organelle = null;
        String protExists = null;
        String dataclass = "STANDARD";
        String copyright = null;
        Map speciesRecs = new TreeMap();
        Map strainRecs = new TreeMap();
        Map tissueRecs = new TreeMap();
        Map transpRecs = new TreeMap();
        Map plasmidRecs = new TreeMap();
        Map genenames = new TreeMap();
        Map genesynonyms = new TreeMap();
        Map orfnames = new TreeMap();
        Map ordlocnames = new TreeMap();
        for (Iterator i = notes.iterator(); i.hasNext(); ) {
            Note n = i.next();
            if (n.getTerm().equals(Terms.getDateCreatedTerm())) cdat=n.getValue();
            else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue();
            else if (n.getTerm().equals(Terms.getDateAnnotatedTerm())) adat=n.getValue();
            else if (n.getTerm().equals(Terms.getUniProtDBNameTerm())) dbname=n.getValue();
            else if (n.getTerm().equals(Terms.getProteinExistsTerm())) protExists=n.getValue();
            else if (n.getTerm().equals(Terms.getRelAnnotatedTerm())) arel=n.getValue();
            else if (n.getTerm().equals(Terms.getDataClassTerm())) dataclass = n.getValue();
            else if (n.getTerm().equals(Terms.getCopyrightTerm())) copyright = n.getValue();
            else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) {
                accessions.append(" ");
                accessions.append(n.getValue());
                accessions.append(";");
            } else if (n.getTerm().equals(Terms.getOrganelleTerm())) organelle = (organelle==null?"":organelle+"; ")+n.getValue();
            // use the nasty hack to split the reference rank away from the actual value in this field
            else if (n.getTerm().equals(Terms.getGeneNameTerm())) {
                String ref = n.getValue();
                int colon = ref.indexOf(':');
                Integer refID = new Integer(0);
                if (colon>=1) refID = new Integer(ref.substring(0,colon));
                genenames.put(refID, ref.substring(colon+1)); // map of id -> string as only one name per gene
            } else if (n.getTerm().equals(Terms.getGeneSynonymTerm())) {
                String ref = n.getValue();
                int colon = ref.indexOf(':');
                Integer refID = new Integer(0);
                if (colon>=1) refID = new Integer(ref.substring(0,colon));
                if (genesynonyms.get(refID)==null) genesynonyms.put(refID, new ArrayList());
                ((List)genesynonyms.get(refID)).add(ref.substring(colon+1));
            } else if (n.getTerm().equals(Terms.getOrderedLocusNameTerm())) {
                String ref = n.getValue();
                int colon = ref.indexOf(':');
                Integer refID = new Integer(0);
                if (colon>=1) refID = new Integer(ref.substring(0,colon));
                if (ordlocnames.get(refID)==null) ordlocnames.put(refID, new ArrayList());
                ((List)ordlocnames.get(refID)).add(ref.substring(colon+1));
            } else if (n.getTerm().equals(Terms.getORFNameTerm())) {
                String ref = n.getValue();
                int colon = ref.indexOf(':');
                Integer refID = new Integer(0);
                if (colon>=1) refID = new Integer(ref.substring(0,colon));
                if (orfnames.get(refID)==null) orfnames.put(refID, new ArrayList());
                ((List)orfnames.get(refID)).add(ref.substring(colon+1));
            }
            // use the nasty hack to split the reference rank away from the actual value in this field
            // we'll end up with a bunch in key 0 for those which did not come from us. We ignore these for now.
            else if (n.getTerm().equals(Terms.getSpeciesTerm())) {
                String ref = n.getValue();
                int colon = ref.indexOf(':');
                Integer refID = new Integer(0);
                if (colon>=1) refID = new Integer(ref.substring(0,colon));
                if (speciesRecs.get(refID)==null) speciesRecs.put(refID, new ArrayList());
                ((List)speciesRecs.get(refID)).add(ref.substring(colon+1));
            } else if (n.getTerm().equals(Terms.getStrainTerm())) {
                String ref = n.getValue();
                int colon = ref.indexOf(':');
                Integer refID = new Integer(0);
                if (colon>=1) refID = new Integer(ref.substring(0,colon));
                if (strainRecs.get(refID)==null) strainRecs.put(refID, new ArrayList());
                ((List)strainRecs.get(refID)).add(ref.substring(colon+1));
            } else if (n.getTerm().equals(Terms.getTissueTerm())) {
                String ref = n.getValue();
                int colon = ref.indexOf(':');
                Integer refID = new Integer(0);
                if (colon>=1) refID = new Integer(ref.substring(0,colon));
                if (tissueRecs.get(refID)==null) tissueRecs.put(refID, new ArrayList());
                ((List)tissueRecs.get(refID)).add(ref.substring(colon+1));
            } else if (n.getTerm().equals(Terms.getTransposonTerm())) {
                String ref = n.getValue();
                int colon = ref.indexOf(':');
                Integer refID = new Integer(0);
                if (colon>=1) refID = new Integer(ref.substring(0,colon));
                if (transpRecs.get(refID)==null) transpRecs.put(refID, new ArrayList());
                ((List)transpRecs.get(refID)).add(ref.substring(colon+1));
            } else if (n.getTerm().equals(Terms.getPlasmidTerm())) {
                String ref = n.getValue();
                int colon = ref.indexOf(':');
                Integer refID = new Integer(0);
                if (colon>=1) refID = new Integer(ref.substring(0,colon));
                if (plasmidRecs.get(refID)==null) plasmidRecs.put(refID, new ArrayList());
                ((List)plasmidRecs.get(refID)).add(ref.substring(colon+1));
            }
        }

        // entryname dataclass; [circular] molecule; division; sequencelength BP.
        StringBuffer locusLine = new StringBuffer();
        locusLine.append(StringTools.rightPad(rs.getName()+"_"+rs.getDivision(),12));
        locusLine.append(" ");
        locusLine.append(StringTools.leftPad(dataclass,19));
        //locusLine.append("; PRT; "); //Uniprot no longer uses the PRT;
        locusLine.append("; ");
        locusLine.append(StringTools.leftPad(""+rs.length(),11));
        locusLine.append(" AA.");
        StringTools.writeKeyValueLine(LOCUS_TAG, locusLine.toString(), 5, this.getLineWidth(), null, LOCUS_TAG, this.getPrintStream());

        // accession line
        StringTools.writeKeyValueLine(ACCESSION_TAG, accessions.toString(), 5, this.getLineWidth(), null, ACCESSION_TAG, this.getPrintStream());

        // date lines: creation falls back to update date, entry version falls back to "0"
        StringTools.writeKeyValueLine(DATE_TAG, (cdat==null?udat:cdat)+", integrated into UniProtKB/"+dbname+".", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream());
        StringTools.writeKeyValueLine(DATE_TAG, udat+", sequence version "+rs.getVersion()+".", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream());
        StringTools.writeKeyValueLine(DATE_TAG, (adat==null?udat:adat)+", entry version "+(arel==null?"0":arel)+".", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream());

        // definition line
        StringTools.writeKeyValueLine(DEFINITION_TAG, rs.getDescription()+".", 5, this.getLineWidth(), null, DEFINITION_TAG, this.getPrintStream());

        // gene line: one GN block per gene id, joined with "and" separator lines
        for (Iterator i = genenames.keySet().iterator(); i.hasNext(); ) {
            Integer geneid = (Integer)i.next();
            String genename = (String)genenames.get(geneid);
            List synonyms = (List)genesynonyms.get(geneid);
            List orfs = (List)orfnames.get(geneid);
            List ordlocs = (List)ordlocnames.get(geneid);

            StringBuffer gnline = new StringBuffer();
            gnline.append(Terms.GENENAME_KEY);
            gnline.append("=");
            gnline.append(genename);
            gnline.append("; ");

            if (synonyms!=null) {
                gnline.append(Terms.GENESYNONYM_KEY);
                gnline.append("=");
                for (Iterator j = synonyms.iterator(); j.hasNext(); ) {
                    gnline.append((String)j.next());
                    if (j.hasNext()) gnline.append(", ");
                }
                gnline.append("; ");
            }
            if (ordlocs!=null) {
                gnline.append(Terms.ORDLOCNAME_KEY);
                gnline.append("=");
                for (Iterator j = ordlocs.iterator(); j.hasNext(); ) {
                    gnline.append((String)j.next());
                    if (j.hasNext()) gnline.append(", ");
                }
                gnline.append("; ");
            }
            if (orfs!=null) {
                gnline.append(Terms.ORFNAME_KEY);
                gnline.append("=");
                for (Iterator j = orfs.iterator(); j.hasNext(); ) {
                    gnline.append((String)j.next());
                    if (j.hasNext()) gnline.append(", ");
                }
                gnline.append("; ");
            }

            StringTools.writeKeyValueLine(GENE_TAG, gnline.toString(), 5, this.getLineWidth(), null, GENE_TAG, this.getPrintStream());

            if (i.hasNext()) StringTools.writeKeyValueLine(GENE_TAG, "and", 5, this.getLineWidth(), null, GENE_TAG, this.getPrintStream());
        }

        // source line (from taxon)
        // organism line
        NCBITaxon tax = rs.getTaxon();
        if (tax!=null) {
            StringBuffer source = new StringBuffer();
            source.append(tax.getDisplayName());
            for (Iterator j = tax.getNames(NCBITaxon.SYNONYM).iterator(); j.hasNext(); ) {
                source.append(" (");
                source.append((String)j.next());
                source.append(")");
            }
            source.append(".");
            StringTools.writeKeyValueLine(SOURCE_TAG, source.toString(), 5, this.getLineWidth(), null, SOURCE_TAG, this.getPrintStream());
            if (organelle!=null) StringTools.writeKeyValueLine(ORGANELLE_TAG, organelle+".", 5, this.getLineWidth(), null, ORGANELLE_TAG, this.getPrintStream());
            StringTools.writeKeyValueLine(ORGANISM_TAG, tax.getNameHierarchy(), 5, this.getLineWidth(), null, ORGANISM_TAG, this.getPrintStream());
            // NOTE(review): unlike the sibling calls above, this uses the short
            // writeKeyValueLine overload (no wrap-prefix arguments) — confirm intended.
            StringTools.writeKeyValueLine(TAXON_TAG, "NCBI_TaxID="+tax.getNCBITaxID()+";", 5, this.getLineWidth(), this.getPrintStream());
        }

        // references - rank (bases x to y)
        for (Iterator r = rs.getRankedDocRefs().iterator(); r.hasNext(); ) {
            RankedDocRef rdr = r.next();
            DocRef d = rdr.getDocumentReference();
            // RN, RP, RC, RX, RG, RA, RT, RL
            StringTools.writeKeyValueLine(REFERENCE_TAG, "["+rdr.getRank()+"]", 5, this.getLineWidth(), null, REFERENCE_TAG, this.getPrintStream());
            if (d.getRemark()!=null)
                StringTools.writeKeyValueLine(RP_LINE_TAG, d.getRemark()+".", 5, this.getLineWidth(), null, RP_LINE_TAG, this.getPrintStream());
            // Print out ref position if present
            if (rdr.getStart()!=null && rdr.getEnd()!=null && d.getRemark()!=null && !rppat.matcher(d.getRemark()).matches()) StringTools.writeKeyValueLine(RP_LINE_TAG, "SEQUENCE OF "+rdr.getStart()+"-"+rdr.getEnd()+".", 5, this.getLineWidth(), null, RP_LINE_TAG, this.getPrintStream());
            // RC lines: emit each rank-bucketed qualifier list gathered in the first pass
            StringBuffer rcline = new StringBuffer();
            Integer rank = new Integer(rdr.getRank());
            if (speciesRecs.get(rank)!=null) {
                rcline.append(Terms.SPECIES_KEY);
                rcline.append("=");
                for (Iterator i = ((List)speciesRecs.get(rank)).iterator(); i.hasNext(); ) {
                    rcline.append((String)i.next());
                    if (i.hasNext()) rcline.append(", ");
                }
                rcline.append("; ");
            }
            if (strainRecs.get(rank)!=null) {
                rcline.append(Terms.STRAIN_KEY);
                rcline.append("=");
                for (Iterator i = ((List)strainRecs.get(rank)).iterator(); i.hasNext(); ) {
                    rcline.append((String)i.next());
                    if (i.hasNext()) rcline.append(", ");
                }
                rcline.append("; ");
            }
            if (tissueRecs.get(rank)!=null) {
                rcline.append(Terms.TISSUE_KEY);
                rcline.append("=");
                for (Iterator i = ((List)tissueRecs.get(rank)).iterator(); i.hasNext(); ) {
                    rcline.append((String)i.next());
                    if (i.hasNext()) rcline.append(", ");
                }
                rcline.append("; ");
            }
            if (transpRecs.get(rank)!=null) {
                rcline.append(Terms.TRANSPOSON_KEY);
                rcline.append("=");
                for (Iterator i = ((List)transpRecs.get(rank)).iterator(); i.hasNext(); ) {
                    rcline.append((String)i.next());
                    if (i.hasNext()) rcline.append(", ");
                }
                rcline.append("; ");
            }
            if (plasmidRecs.get(rank)!=null) {
                rcline.append(Terms.PLASMID_KEY);
                rcline.append("=");
                for (Iterator i = ((List)plasmidRecs.get(rank)).iterator(); i.hasNext(); ) {
                    rcline.append((String)i.next());
                    if (i.hasNext()) rcline.append(", ");
                }
                rcline.append("; ");
            }
            // print the rcline
            if (rcline.length()>0) StringTools.writeKeyValueLine(RC_LINE_TAG, rcline.toString(), 5, this.getLineWidth(), null, RC_LINE_TAG, this.getPrintStream());
            // Deal with RX and rest
            CrossRef c = d.getCrossref();
            if (c!=null) StringTools.writeKeyValueLine(REFERENCE_XREF_TAG, c.getDbname()+"="+c.getAccession()+";", 5, this.getLineWidth(), null, REFERENCE_XREF_TAG, this.getPrintStream());
            List auths = d.getAuthorList();
            // Consortium authors go on their own RG line and are removed from the RA list.
            for (Iterator j = auths.iterator(); j.hasNext(); ) {
                DocRefAuthor a = j.next();
                if (a.isConsortium()) {
                    StringTools.writeKeyValueLine(CONSORTIUM_TAG, a.getName()+";", 5, this.getLineWidth(), null, CONSORTIUM_TAG, this.getPrintStream());
                    j.remove();
                }
            }
            if (!auths.isEmpty()) StringTools.writeKeyValueLine(AUTHORS_TAG, DocRefAuthor.Tools.generateAuthorString(auths, false)+";", 5, this.getLineWidth(), null, AUTHORS_TAG, this.getPrintStream());
            if (d.getTitle()!=null && d.getTitle().length()!=0) StringTools.writeKeyValueLine(TITLE_TAG, "\""+d.getTitle()+"\";", 5, this.getLineWidth(), null, TITLE_TAG, this.getPrintStream());
            StringTools.writeKeyValueLine(LOCATION_TAG, d.getLocation()+".", 5, this.getLineWidth(), null, LOCATION_TAG, this.getPrintStream());
        }

        // comments - if any
        if (!rs.getComments().isEmpty()) {
            for (Iterator i = rs.getComments().iterator(); i.hasNext(); ) {
                Comment c = i.next();
                String text = c.getComment().trim();
                // NOTE(review): both branches of this conditional are identical, so the "-!-"
                // test is dead code; upstream BioJava treats non-"-!-" comments differently —
                // confirm whether a prefix was meant to be added in the else branch.
                if (text.length()>3 && text.substring(0,3).equals("-!-")) StringTools.writeKeyValueLine(COMMENT_TAG, text, 5, this.getLineWidth(), null, COMMENT_TAG, this.getPrintStream());
                else StringTools.writeKeyValueLine(COMMENT_TAG, text, 5, this.getLineWidth(), null, COMMENT_TAG, this.getPrintStream());
            }
        }

        // copyright - if any
        if (copyright!=null)
            StringTools.writeKeyValueLine(COMMENT_TAG, copyright, 5, this.getLineWidth(), null, COMMENT_TAG, this.getPrintStream());

        // db references - ranked; secondary accessions come from the cross-ref's note set,
        // "-" is written when there are none.
        for (Iterator r = rs.getRankedCrossRefs().iterator(); r.hasNext(); ) {
            RankedCrossRef rcr = r.next();
            CrossRef c = rcr.getCrossRef();
            Set noteset = c.getNoteSet();
            StringBuffer sb = new StringBuffer();
            sb.append(c.getDbname());
            sb.append("; ");
            sb.append(c.getAccession());
            boolean hasSecondary = false;
            for (Iterator i = noteset.iterator(); i.hasNext(); ) {
                Note n = i.next();
                if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) {
                    sb.append("; ");
                    sb.append(n.getValue());
                    hasSecondary = true;
                }
            }
            if (!hasSecondary) sb.append("; -");
            sb.append(".");
            StringTools.writeKeyValueLine(DATABASE_XREF_TAG, sb.toString(), 5, this.getLineWidth(), null, DATABASE_XREF_TAG, this.getPrintStream());
        }

        // protein exists line
        if (protExists!=null) {
            StringTools.writeKeyValueLine(PROTEIN_EXIST_TAG, protExists+";", 5, this.getLineWidth(), null, PROTEIN_EXIST_TAG, this.getPrintStream());
        }

        // keywords line: all keyword notes joined with "; "
        String keywords = null;
        for (Iterator n = notes.iterator(); n.hasNext(); ) {
            Note nt = n.next();
            if (nt.getTerm().equals(Terms.getKeywordTerm())) {
                if (keywords==null) keywords = nt.getValue();
                else keywords = keywords+"; "+nt.getValue();
            }
        }
        if (keywords!=null) {
            StringTools.writeKeyValueLine(KEYWORDS_TAG, keywords+".", 5, this.getLineWidth(), null, KEYWORDS_TAG, this.getPrintStream());
        }

        // feature_type location
        for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) {
            RichFeature f = (RichFeature)i.next();
            String desc = "";
            String ftid = null;
            for (Iterator j = f.getNoteSet().iterator(); j.hasNext(); ) {
                Note n = j.next();
                if (n.getTerm().equals(Terms.getFTIdTerm())) ftid = n.getValue();
                else if (n.getTerm().equals(Terms.getFeatureDescTerm())) desc = n.getValue();
            }
            String kw = f.getTypeTerm().getName();
            String leader = StringTools.rightPad(kw,8)+" "+UniProtLocationParser.writeLocation((RichLocation)f.getLocation());
            if(desc.length()==0)
                this.getPrintStream().println(FEATURE_TAG+" "+leader); //see #2277
            else
                StringTools.writeKeyValueLine(FEATURE_TAG+" "+leader, desc+".", 34, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
            if (ftid!=null) StringTools.writeKeyValueLine(FEATURE_TAG, "/FTId="+ftid+".", 34, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
        }

        // sequence header: molecular weight + CRC64 checksum of the raw sequence string
        int mw = 0;
        try {
            mw = (int) MassCalc.getMolecularWeight(rs);
        } catch (IllegalSymbolException e) {
            throw new RuntimeException("Found illegal symbol", e);
        }
        CRC64Checksum crc = new CRC64Checksum();
        String seqstr = rs.seqString();
        crc.update(seqstr.getBytes(),0,seqstr.length());
        this.getPrintStream().print(START_SEQUENCE_TAG+" SEQUENCE "+StringTools.leftPad(""+rs.length(),4)+" AA; ");
        this.getPrintStream().print(StringTools.leftPad(""+mw,5)+" MW; ");
        this.getPrintStream().println(crc+" CRC64;");

        // sequence stuff: 60 residues per line, in space-separated groups of 10
        Symbol[] syms = (Symbol[])rs.toList().toArray(new Symbol[0]);
        int symCount = 0;
        this.getPrintStream().print(" ");
        for (int i = 0; i < syms.length; i++) {
            if (symCount % 60 == 0 && symCount>0) {
                this.getPrintStream().print("\n ");
            }
            if (symCount % 10 == 0) {
                this.getPrintStream().print(" ");
            }
            try {
                this.getPrintStream().print(tok.tokenizeSymbol(syms[i]));
            } catch (IllegalSymbolException e) {
                // NOTE(review): cause is dropped here (unlike the MW catch above) — consider
                // passing e as the cause.
                throw new RuntimeException("Found illegal symbol: "+syms[i]);
            }
            symCount++;
        }
        this.getPrintStream().print("\n");
        this.getPrintStream().println(END_SEQUENCE_TAG);
    }

    /**
     * {@inheritDoc}
     */
    public String getDefaultFormat() {
        return UNIPROT_FORMAT;
    }

    /**
     * Converts the current parse section to a String. Useful for debugging.
     */
    String sectionToString(List section){
        StringBuffer parseBlock = new StringBuffer();
        // Each element is a String[2] of {tag, value}; re-join them tag-first.
        for(Iterator i = section.listIterator(); i.hasNext();){
            String[] part = (String[])i.next();
            for(int x = 0; x < part.length; x++){
                parseBlock.append(part[x]);
                if(x == 0){
                    parseBlock.append(" "); //the gap will have been trimmed
                }
            }
        }
        return parseBlock.toString();
    }
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/ProcessNew.java b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/ProcessNew.java
new file mode 100644
index 0000000..9a080c9
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/ProcessNew.java
@@ -0,0 +1,571 @@
package cn.piflow.bundle.microorganism.util;



import org.biojava.bio.seq.Feature;
import org.biojavax.*;
import org.biojavax.bio.seq.RichFeature;
import org.biojavax.bio.seq.RichSequence;
import org.biojavax.ontology.SimpleComparableTerm;
import org.json.JSONArray;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by xiujuan on 2016/3/24.
 */
public class ProcessNew {

    static final Logger logger = LoggerFactory.getLogger(ProcessNew.class);
    // Four-digit year anywhere in a collection-date string.
    static final Pattern dp = Pattern.compile("(\\d{4})");
    // "lat N|S lon W|E" latitude/longitude qualifier, e.g. "12.3 N 45.6 E".
    static final Pattern llp = Pattern.compile("(\\S+)\\s([SN])\\s(\\S+)\\s([WE])");
    static final Pattern submitDatep = Pattern.compile("^Submitted\\s+\\((\\S+)\\)\\s+(.*)$");
    // NOTE(review): SimpleDateFormat is not thread-safe; these shared static instances
    // are only safe if the processors are called single-threaded — confirm callers.
    static final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
    static final SimpleDateFormat format = new SimpleDateFormat("dd-MMM-yyyy", Locale.ENGLISH);

    // static AddressCountryDict dict = AddressCountryDict.getInstance();

    /**
     * Flattens a (GenBank/RefSeq-style) RichSequence into a HashMap of scalar fields
     * plus a "features" JSONArray; feature qualifiers such as altitude, collection_date,
     * country, culture_collection and lat_lon get derived, normalised keys.
     *
     * @param seq the parsed sequence entry
     * @return map of field name to value, ready for downstream serialisation
     * @throws ParseException if a date note cannot be parsed with "dd-MMM-yyyy"
     */
    public static HashMap processSingleSequence(RichSequence seq) throws ParseException {
        //try{
        // logger.info("doc: " + seq.getAccession());

        HashMap map = new HashMap() ;


        map.put("Sequence", seq.seqString());
        map.put("Accession", seq.getAccession());

        map.put("SequenceLength", seq.getInternalSymbolList().length());
        if (seq.getTaxon() != null) {
            map.put("TaxonID", seq.getTaxon().getNCBITaxID());
            map.put("Organism", seq.getTaxon().getDisplayName());
        }
        map.put("Description", seq.getDescription().replace('\n', ' '));

        map.put("Division", seq.getDivision());
        map.put("Identifier", seq.getIdentifier());
        map.put("Version", seq.getVersion());

        if (seq.getCircular()) {
            map.put("Topology", "Circular");
        } else {
            map.put("Topology", "Linear");
        }

        // Notes: the term's toString() is prefixed (9 chars are stripped) before matching.
        for (Note note : seq.getNoteSet()) {
            String noteName = note.getTerm().toString().substring(9);
            if (noteName.indexOf("moltype") != -1) {
                map.put("MoleculeType", note.getValue());
            } else if (noteName.indexOf("Organism") != -1) {
                String organism = note.getValue();
                //doc.put("Organism", organism.substring(0,organism.indexOf("\n")));
                map.put("Lineage", organism.substring(organism.indexOf("\n")).replaceAll("\n", ""));
            } else if (noteName.indexOf("acc") != -1) {
                map.put("AdditionalAccs", note.getValue());
            } else if (noteName.indexOf("DBLink") != -1) { //deal with dblinks
                JSONArray dbLinks = new JSONArray();
                String[] val = note.getValue().split("\\n");
                for (String v : val) {
                    int index = v.indexOf(":");
                    if (index != -1) {
                        JSONObject link = new JSONObject();
                        link.put(v.substring(0, index), v.substring(index + 1).trim());
                        dbLinks.put(link);
                    } else { // value splitted into more than one line
                        JSONObject last = dbLinks.getJSONObject(dbLinks.length() - 1);
                        String key = last.keys().next();
                        String value = last.get(key).toString();
                        String newVal = value + v;
                        last.put(key, newVal);
                    }
                }
                map.put("dbLinks", dbLinks);
            } else if (noteName.equals("kw")) {
                map.put("KeyWords", note.getValue());
            } else if (noteName.equals("udat")) {
                map.put("dateUpdated", formatter.format(format.parse(note.getValue())));
            } else {
                map.put(noteName, note.getValue());
            }
        }

        //features
        JSONArray featureArray = new JSONArray();
        Iterator featureIterator = seq.features();
        List isolates = new ArrayList();
        while (featureIterator.hasNext()) {
            JSONObject featureObject = new JSONObject();
            List dbxrefArray = new ArrayList();
            RichFeature feature = (RichFeature) featureIterator.next();
            for (RankedCrossRef rankedCrossRef : feature.getRankedCrossRefs()) {
                dbxrefArray.add(rankedCrossRef.getCrossRef().getDbname() + ":" + rankedCrossRef.getCrossRef().getAccession());
            }
            featureObject.put("db_xref", dbxrefArray);

            featureObject.put("featureType", feature.getType());
            Map featureMap = feature.getAnnotation().asMap();
            Iterator<SimpleComparableTerm> featureKeyIterator = featureMap.keySet().iterator();
            while (featureKeyIterator.hasNext()) {
                SimpleComparableTerm term = featureKeyIterator.next();
                String name = term.getName();
                String nameValue = featureMap.get(term).toString();
                //isolate is an array?

                if (name.indexOf("altitude") != -1) {
                    featureObject.put("altitude_value", Float.valueOf(nameValue.substring(0, nameValue.indexOf(" ")))); //number, take care of negative number
                } else if (name.indexOf("collection_date") != -1) {
                    if (getCollectionYear(nameValue) != 0) {
                        featureObject.put("collection_year", getCollectionYear(nameValue));
                    }
                } else if (name.indexOf("country") != -1) {
                    if (nameValue.indexOf(":") != -1) {
                        featureObject.put("CollectionCountry", nameValue.substring(0, nameValue.indexOf(":")));
                    }
                } else if (name.indexOf("culture_collection") != -1) {
                    // "INST:id" preferred, falling back to "INST id".
                    int index = nameValue.indexOf(":") != -1 ? nameValue.indexOf(":") : nameValue.indexOf(" ");
                    if (index != -1) {
                        featureObject.put("InstitutionCode", nameValue.substring(0, index));
                        featureObject.put("CultureID", nameValue.substring(index + 1));
                    }
                } else if (name.indexOf("lat_lon") != -1) {
                    Float[] arr = getLat_Lon(nameValue);
                    if (arr != null) {
                        featureObject.put("Latitude", arr[0]);
                        featureObject.put("Longitude", arr[1]);
                    }
                } else if (name.indexOf("pathovar") != -1) {
                    // intentionally empty: pathovar gets no derived field, but is still
                    // copied through by the unconditional put below
                } else if (feature.getType().equals("source") && name.equals("isolate")) {
                    isolates.add(nameValue);
                }
                // Always copy the raw qualifier through as well.
                featureObject.put(term.getName(), featureMap.get(term));
            }
            featureArray.put(featureObject);
            //for garbage collection
            featureObject = null;
            dbxrefArray = null;
            feature = null;
            featureMap = null;
        }
        map.put("features", featureArray);
        if (isolates.size() > 0) {
            map.put("isolate_all", isolates);
        }
        return map;
    }

    /**
     * Extracts the first four-digit year found in a collection-date string.
     * @return the year, or 0 if none is present
     */
    public static int getCollectionYear(String date){
        Matcher m = dp.matcher(date);
        String year;
        if(m.find()){
            year = m.group(1);
            return Integer.parseInt(year);
        }else{
            return 0;
        }
    }

    /**
     * Parses a "lat N|S lon W|E" qualifier into signed degrees.
     * @return {latitude, longitude} with S/W negated, or null if the string does not
     *         match or a coordinate is not numeric
     */
    public static Float[] getLat_Lon(String lat_lon){
        Matcher m = llp.matcher(lat_lon);
        Float[] array = null;
        try{
            if(m.matches()){
                array = new Float[2];
                if(m.group(2).equals("N")){
                    array[0] = Float.valueOf(m.group(1));
                }else{
                    array[0] = Float.valueOf("0")-Float.valueOf(m.group(1));
                }
                if(m.group(4).equals("E")){
                    array[1] = Float.valueOf(m.group(3));
                }else{
                    array[1] = Float.valueOf("0")-Float.valueOf(m.group(3));
                }
            }
        }catch (NumberFormatException nfe){
            return null;
        }
        return array;
    }

    /**
     * Populates {@code doc} from a UniProt RichSequence: header fields, taxon names,
     * cross/doc references, comments, features, sequence and note-derived properties
     * (keywords, dates, organism hosts, secondary accessions, docref extras).
     *
     * @throws ParseException if a date note cannot be parsed with "dd-MMM-yyyy"
     */
    public static void processUniprotSeq(RichSequence seq, JSONObject doc) throws ParseException {
        logger.info("doc: " + seq.getAccession());
        doc.put("Accession", seq.getAccession());
        doc.put("Name", seq.getName());
        doc.put("Division", seq.getDivision());
        doc.put("Description", seq.getDescription().replace('\n', ' '));
        doc.put("Version", seq.getVersion());
        doc.put("sequencelength", seq.length());
        //Taxon
        doc.put("TaxonID", seq.getTaxon().getNCBITaxID());
        for(Object name: seq.getTaxon().getNameClasses()){
            doc.put("Taxon_"+(String)name, seq.getTaxon().getNames((String)name));
        }

        //rankedcrossrefs
        /*JSONArray rankedCrossRefs = new JSONArray();
        for(RankedCrossRef rankedCrossRef : seq.getRankedCrossRefs()){
            JSONObject ref = new JSONObject();
            String key = rankedCrossRef.getCrossRef().getDbname();
            String accessions = rankedCrossRef.getCrossRef().getAccession();
            for(Note note : rankedCrossRef.getCrossRef().getRichAnnotation().getNoteSet()){
                accessions += ";"+note.getValue();
            }
            ref.put(key, accessions);
            rankedCrossRefs.put(ref);
        }
        if(rankedCrossRefs.length() > 0){
            doc.put("rankedCrossRefs", rankedCrossRefs);
        }*/
        processRankedCrossRefs(seq, doc);
        //comments: each "-!- TOPIC: text" becomes a {TOPIC: text} object
        JSONArray comments = new JSONArray();
        for(Comment comment : seq.getComments()){
            JSONObject cmtObj = new JSONObject();
            String cmt = comment.getComment().replace('\n', ' ');
            cmt = cmt.substring(3);
            int index = cmt.indexOf(":");
            cmtObj.put(cmt.substring(0,index).trim(),cmt.substring(index+1).trim());
            comments.put(cmtObj);
        }
        if(comments.length() > 0){
            doc.put("comments", comments);
        }
        //features
        JSONArray features = new JSONArray();
        Iterator featureIterator = seq.features();
        while(featureIterator.hasNext()){
            JSONObject featureObject = new JSONObject();
            List dbxrefArray = new ArrayList();
            RichFeature feature = (RichFeature)featureIterator.next();
            for(RankedCrossRef rankedCrossRef : feature.getRankedCrossRefs()){
                dbxrefArray.add(rankedCrossRef.getCrossRef().getDbname() + ":" + rankedCrossRef.getCrossRef().getAccession());
            }
            if(dbxrefArray.size() > 0){
                featureObject.put("rankedCrossRefs", dbxrefArray);
            }
            featureObject.put("type", feature.getType());
            featureObject.put("location_start", feature.getLocation().getMin());
            featureObject.put("location_end", feature.getLocation().getMax());
            Map featureMap = feature.getAnnotation().asMap();
            Iterator<SimpleComparableTerm> featureKeyIterator = featureMap.keySet().iterator();
            while(featureKeyIterator.hasNext()){
                SimpleComparableTerm term = featureKeyIterator.next();
                featureObject.put(term.getName(),featureMap.get(term));
            }
            features.put(featureObject);
        }
        if(features.length() > 0){
            doc.put("features", features);
        }
        //sequence
        doc.put("sequence", seq.seqString());

        JSONArray rankedDocRefs = new JSONArray();
        // rank -> extra "key:value" strings harvested from docref_* notes below.
        // NOTE(review): generic parameters reconstructed from a mangled patch — confirm.
        Map<Integer, List<String>> rankedDocRefs_addiInfo = new HashMap<Integer, List<String>>();
        //properties from notes: rlistener.addSequenceProperty
        List keywords = new ArrayList();
        List secondaryAccs = new ArrayList();
        JSONArray organismHosts = new JSONArray();
        for(Note note : seq.getNoteSet()){
            String note_term = note.getTerm().getName();
            if(note_term.equals("kw")){
                keywords.add(note.getValue());
            }else if(note_term.equals("cdat")){
                doc.put("dateCreated", formatter.format(format.parse(note.getValue())));
            }else if(note_term.equals("udat")){
                doc.put("dateUpdated", formatter.format(format.parse(note.getValue())));
            }else if(note_term.equals("adat")){
                doc.put("dateAnnotated", formatter.format(format.parse(note.getValue())));
            }else if(note_term.equals("arel")){
                doc.put("relAnnotated", note.getValue());
            }else if(note_term.equals("Organism host")){
                // "NCBI_TaxID=nnn; Scientific name (common name) (synonym)..." — split the
                // taxid part from the name part, then peel bracketed names.
                JSONObject organismHost = new JSONObject();
                String sciname;
                String comname;
                String names = null;
                List synonym = new ArrayList();
                String[] parts = note.getValue().split(";");
                if(parts[0].matches("\\S+=\\S+")){
                    String[] moreparts = parts[0].split("=");
                    if(moreparts[0].equals("NCBI_TaxID")){
                        organismHost.put("NCBI_TaxID",Integer.parseInt(moreparts[1]));
                    }else{
                        organismHost.put(moreparts[0],moreparts[1]);
                    }
                }else{
                    names = parts[0];
                }
                if(parts.length > 1){
                    names = parts[1];
                }
                if(names != null){
                    if (names.endsWith(".")) names = names.substring(0,names.length()-1); // chomp trailing dot
                    String[] nameparts = names.split("\\(");
                    sciname = nameparts[0].trim();
                    organismHost.put("scientific name", sciname);
                    if (nameparts.length>1) {
                        comname = nameparts[1].trim();
                        if (comname.endsWith(")")) comname = comname.substring(0,comname.length()-1); // chomp trailing bracket
                        organismHost.put("common name", comname);
                        if (nameparts.length>2) {
                            // synonyms
                            for (int j = 2 ; j < nameparts.length; j++) {
                                String syn = nameparts[j].trim();
                                if (syn.endsWith(")")) syn = syn.substring(0,syn.length()-1); // chomp trailing bracket
                                synonym.add(syn);
                            }
                            organismHost.put("synonym", synonym);
                        }
                    }
                }
                organismHosts.put(organismHost);
            }else if(note_term.equals("Sequence meta info")){
                // "SEQUENCE nnn AA; mmm MW; xxx CRC64;" — index 1 is the weight, index 2 the checksum.
                String seqMetaInfo = note.getValue();
                if(seqMetaInfo.startsWith("SEQUENCE")){
                    seqMetaInfo = seqMetaInfo.substring(8);
                }
                String[] parts = seqMetaInfo.split(";");
                if(parts.length > 1){
                    doc.put("molecular weight", Integer.parseInt(parts[1].trim().split(" ")[0]));
                    if(parts.length > 2){
                        String[] moreparts = parts[2].trim().split(" ");
                        doc.put(moreparts[1], moreparts[0]);
                    }
                }
            }else if(note_term.startsWith("docref")){
                // Note value is "rank: value"; note term is "docref_<key>".
                int rank = Integer.parseInt(note.getValue().split(":")[0].trim());
                String key = note_term.substring(7); //remove the precedding "docref_"
                if(key.contains("biojavax:")){
                    key = key.substring(9); //remove "biojavax:"
                }
                String value = note.getValue().substring(note.getValue().indexOf(":")+1).trim();
                if(rankedDocRefs_addiInfo.containsKey(rank)){
                    rankedDocRefs_addiInfo.get(rank).add(key+":"+value);
                }else{
                    List tmp = new ArrayList();
                    tmp.add( key+":"+value);
                    rankedDocRefs_addiInfo.put(rank,tmp);
                }
            }else if(note_term.equals("acc")){
                secondaryAccs.add(note.getValue());
            }else{
                doc.put(note_term, note.getValue());
            }
        }
        if(secondaryAccs.size() > 0){
            doc.put("secondaryacc",secondaryAccs);
        }
        if(organismHosts.length() > 0){
            doc.put("organismhost", organismHosts);
        }
        if(keywords.size() > 0){
            doc.put("keywords", keywords);
        }

        //rankeddocref: merge the docref_* extras (by rank) into each reference object
        for(RankedDocRef rankedDocRef : seq.getRankedDocRefs()){
            JSONObject rankedDocRefObj = new JSONObject();
            DocRef docRef = rankedDocRef.getDocumentReference();
            rankedDocRefObj.put("rank", rankedDocRef.getRank());
            rankedDocRefObj.put("authors", docRef.getAuthors());
            rankedDocRefObj.put("title", docRef.getTitle());
            rankedDocRefObj.put("location", docRef.getLocation());
            rankedDocRefObj.put("remark", docRef.getRemark());
            for(Map.Entry entry : rankedDocRefs_addiInfo.entrySet()){
                if((Integer)(entry.getKey()) == rankedDocRef.getRank()){
                    for(String pair : (List<String>)(entry.getValue())){
                        int index = pair.indexOf(":");
                        rankedDocRefObj.put(pair.substring(0, index),pair.substring(index+1));
                    }
                }
            }
            rankedDocRefs.put(rankedDocRefObj);
        }
        if(rankedDocRefs.length() > 0){
            doc.put("rankedDocRefs", rankedDocRefs);
        }
    }

    /**
     * Populates {@code doc} from an EMBL/Ensembl RichSequence: topology, note-derived
     * header fields, taxon, cross references, comments and features (with altitude and
     * collection-date normalisation).
     *
     * @throws ParseException if a date note cannot be parsed with "dd-MMM-yyyy"
     */
    public static void processEMBL_EnsemblSeq(RichSequence seq,JSONObject doc) throws ParseException {
        logger.info("accession: " + seq.getName());
        if(seq.getCircular()){
            doc.put("Topology", "Circular");
        }else{
            doc.put("Topology", "Linear");
        }
        for(Note note : seq.getNoteSet()){
            String noteName = note.getTerm().toString().substring(9);
            if(noteName.equals("moltype")){
                doc.put("Molecule type", note.getValue());
            }else if(noteName.equals("organism")){
                doc.put("Classfication", note.getValue().replaceAll("\n", ""));
            }else if(noteName.equals("kw")){
                doc.put("KeyWords", note.getValue());
            }else if(noteName.equals("udat")){
                doc.put("dateUpdated", formatter.format(format.parse(note.getValue())));
            }else if(noteName.equals("cdat")){
                doc.put("dateCreated", formatter.format(format.parse(note.getValue())));
            }else{
                doc.put(noteName, note.getValue());
            }
        }
        doc.put("SequenceLength", seq.getInternalSymbolList().length());
        doc.put("Description", seq.getDescription().replace('\n', ' '));
        //System.out.println(seq.getInternalSymbolList().length());
        //doc.put("Sequence length", seq.getInternalSymbolList().length());
        doc.put("Accession", seq.getName());
        doc.put("Organism",seq.getTaxon().getDisplayName());
        doc.put("TaxonID", seq.getTaxon().getNCBITaxID());

        /*for (RankedDocRef rankDocRef : seq.getRankedDocRefs()){
            if(rankDocRef.getDocumentReference().getLocation().indexOf("Submitted") != -1){
                int dotindex = rankDocRef.getDocumentReference().getLocation().indexOf(".");
                String submitDate = rankDocRef.getDocumentReference().getLocation().substring(11,22);
                String submitAddress = rankDocRef.getDocumentReference().getLocation().substring(dotindex+1).trim();
                doc.put("SubmitDate", format.parse(submitDate));
                doc.put("SubmittedAddress", rankDocRef.getDocumentReference().getLocation().substring(dotindex+1).trim());
            }
        }*/
        //rankedDocRefs
        //processRankedDocRefs(seq, doc);

        //rankedCrossRef
        processRankedCrossRefs(seq, doc);

        //comments
        processComment(seq, doc);

        //features
        JSONArray featureArray = new JSONArray();
        Iterator featureIterator = seq.features();
        while (featureIterator.hasNext()){
            JSONObject featureObject = new JSONObject();
            List dbxrefArray = new ArrayList();
            RichFeature feature = (RichFeature)featureIterator.next();
            //deal with db_xref in each feature
            //db_xref is not required in the requirement
            for(RankedCrossRef rankedCrossRef : feature.getRankedCrossRefs()){
                dbxrefArray.add(rankedCrossRef.getCrossRef().getDbname() + ":" + rankedCrossRef.getCrossRef().getAccession());
            }
            featureObject.put("db_xref", dbxrefArray);

            featureObject.put("featureType", feature.getType());
            Map featureMap = feature.getAnnotation().asMap();
            Iterator<SimpleComparableTerm> featureKeyIterator = featureMap.keySet().iterator();
            while(featureKeyIterator.hasNext()){
                SimpleComparableTerm term = featureKeyIterator.next();
                String name = term.getName();
                String nameValue = featureMap.get(term).toString();

                if(name.equals("altitude")){
                    featureObject.put("altitude_value", Float.valueOf(nameValue.substring(0,nameValue.indexOf("m")).trim())); //number, take care of negative number
                }else if(name.equals("collection_date")){
                    // Possibly a "date1/date2" range; normalise dd-MMM-yyyy values and
                    // always record the extracted year.
                    JSONArray collectionDates = new JSONArray();
                    for(String singleDate : nameValue.split("/")){
                        JSONObject collectionDate = new JSONObject();
                        if(singleDate.endsWith("FT")){
                            singleDate = singleDate.substring(0, singleDate.length()-2);
                        }
                        if(singleDate.matches("\\d{2}-\\w{3}-\\d{4}")){
                            collectionDate.put("collection_date", formatter.format(format.parse(singleDate)));
                        }else{
                            collectionDate.put("collection_date", singleDate);
                        }

                        collectionDate.put("collection_year", getCollectionYear(singleDate));
                        collectionDates.put(collectionDate);
                    }
                    featureObject.put("collectionDate", collectionDates);
                }
                // Always copy the raw qualifier through as well.
                featureObject.put(term.getName(),featureMap.get(term));
            }
            featureArray.put(featureObject);
        }
        doc.put("features", featureArray);
    }

    // Method continues beyond this chunk of the patch; body reproduced as-is up to the cut.
    public static void processRankedCrossRefs(RichSequence seq, JSONObject doc){
        JSONArray rankedCrossRefs = new JSONArray();
        for(RankedCrossRef rankedCrossRef : seq.getRankedCrossRefs()){
            JSONObject ref = new JSONObject();
            String key = rankedCrossRef.getCrossRef().getDbname();
            String accessions = rankedCrossRef.getCrossRef().getAccession();
            for(Note note : rankedCrossRef.getCrossRef().getRichAnnotation().getNoteSet()){
                accessions +=
";"+note.getValue(); + } + ref.put(key, accessions); + rankedCrossRefs.put(ref); + } + if(rankedCrossRefs.length() > 0){ + doc.put("rankedCrossRefs", rankedCrossRefs); + } + } + +// public static void processRankedDocRefs(RichSequence seq, JSONObject doc) throws ParseException { +// JSONArray rankedDocRefs = new JSONArray(); +// for(RankedDocRef rankedDocRef : seq.getRankedDocRefs()){ +// DocRef docRef = rankedDocRef.getDocumentReference(); +// JSONObject rankedRef = new JSONObject(); +// rankedRef.put("authors", docRef.getAuthors()); +// rankedRef.put("title", docRef.getTitle()); +// if(docRef.getCrossref() != null){ +// String dbName = docRef.getCrossref().getDbname(); +// if(dbName.equals("PUBMED")){ +// rankedRef.put(dbName, Integer.parseInt(docRef.getCrossref().getAccession())); +// }else{ +// rankedRef.put(dbName, docRef.getCrossref().getAccession()); +// } +// } +// Matcher m = submitDatep.matcher(docRef.getLocation().replaceAll("\n", " ")); +// if(m.matches()){ +// rankedRef.put("SubmitDate", formatter.format(format.parse(m.group(1)))); +// rankedRef.put("SubmitAddress", m.group(2)); +// int year = Integer.parseInt(m.group(1).substring(m.group(1).lastIndexOf("-")+1)); +// rankedRef.put("SubmitYear", year); +// //submitCountry--extract from SubmitAddress +// String countryName = dict.mappingCountry(m.group(2)); +// if(countryName != null){ +// rankedRef.put("SubmitCountry", countryName); +// } +// } +// rankedDocRefs.put(rankedRef); +// } +// doc.put("rankedDocRefs", rankedDocRefs); +// } + + public static void processComment(RichSequence seq, JSONObject doc){ + Map commentMetaData = new HashMap(); + JSONArray comments = new JSONArray(); + for(Comment comment: seq.getComments()){ + JSONObject commentObj = new JSONObject(); + if(comment.getComment().indexOf("::") != -1){ + String comm[] = comment.getComment().split("\n"); + for(int i = 0; i < comm.length; i++){ + if(comm[i].matches("(.*)\\s+::\\s+(.*)")){ + String[] metaData = comm[i].split("::"); + String 
key = metaData[0].trim(); + String value = metaData[1].trim(); + if(key.contains(".")){ + key = key.replaceAll("\\.", " "); + } + commentMetaData.put(key, value); + } + } + commentObj.put("commentMeta", commentMetaData); + }else{ + commentObj.put("comment", comment.getComment()); + } + comments.put(commentObj); + } + doc.put("comments", comments); + } +} diff --git a/piflow-bundle/src/test/scala/cn/piflow/bundle/ftp/emblTest.scala b/piflow-bundle/src/test/scala/cn/piflow/bundle/ftp/emblTest.scala new file mode 100644 index 0000000..e4d1626 --- /dev/null +++ b/piflow-bundle/src/test/scala/cn/piflow/bundle/ftp/emblTest.scala @@ -0,0 +1,87 @@ +package cn.piflow.bundle.ftp + +import cn.piflow.Runner +import cn.piflow.conf.bean.FlowBean +import cn.piflow.conf.util.{FileUtil, OptionUtil} +import org.apache.spark.sql.SparkSession +import org.h2.tools.Server +import org.jsoup.Jsoup +import org.jsoup.select.Elements +import org.junit.Test + +import scala.util.parsing.json.JSON + +class emblTest { + + @Test + def testEmblDataParse(): Unit ={ + + //parse flow json +// val file = "src/main/resources/yqd/down.json" +//val file = "src/main/resources/yqd/refseq_genome.json" +//val file = "src/main/resources/yqd/select_unzip.json" +val file = "src/main/resources/yqd/embl_parser.json" + + val flowJsonStr = FileUtil.fileReader(file) + + val map = OptionUtil.getAny(JSON.parseFull(flowJsonStr)).asInstanceOf[Map[String, Any]] + println(map) + + //create flow + val flowBean = FlowBean(map) + val flow = flowBean.constructFlow() + + val h2Server = Server.createTcpServer("-tcp", "-tcpAllowOthers", "-tcpPort","50001").start() + //execute flow + val spark = SparkSession.builder() + .master("spark://10.0.88.70:7077") + .appName("Embl") + .config("spark.driver.memory", "8g") + .config("spark.executor.memory", "16g") + .config("spark.cores.max", "16") + .config("spark.jars","/root/Desktop/weishengwu/out/artifacts/piflow_bundle/piflow_bundle.jar") + .enableHiveSupport() + .getOrCreate() + + val 
process = Runner.create() + .bind(classOf[SparkSession].getName, spark) + .bind("checkpoint.path", "hdfs://10.0.86.89:9000/xjzhu/piflow/checkpoints/") + .start(flow); + + process.awaitTermination(); + val pid = process.pid(); + println(pid + "!!!!!!!!!!!!!!!!!!!!!") + spark.close(); + } + + + @Test + def testEmblDataParse11(): Unit ={ + + val url ="http://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/" + val doc = Jsoup.connect(url).timeout(100000000).get() + // fetch the directory-listing page: file name, last-modified date, size + // Name Last modified Size Parent Directory - + // build_gbff_cu.pl 2003-04-25 17:23 21K + + val elements: Elements = doc.select("html >body >table >tbody") +// println(elements) + println(elements.first().text()) + + // split the elements text into one string per line + val fileString = elements.first().text().split("\\n") + + + for (i <- 0 until fileString.size) { + + println(fileString(i)) + } + + println(fileString) + } + + + + + +} From dc652eb798fdd9b972025d1310f015a3321e9d07 Mon Sep 17 00:00:00 2001 From: judy0131 Date: Mon, 24 Dec 2018 11:23:53 +0800 Subject: [PATCH 2/3] Create doc --- doc | 1 + 1 file changed, 1 insertion(+) create mode 100644 doc diff --git a/doc b/doc new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/doc @@ -0,0 +1 @@ + From 01e6a5c3f445508439f367bfe5a334c43524e065 Mon Sep 17 00:00:00 2001 From: judy0131 Date: Mon, 24 Dec 2018 11:24:13 +0800 Subject: [PATCH 3/3] Delete doc --- doc | 1 - 1 file changed, 1 deletion(-) delete mode 100644 doc diff --git a/doc b/doc deleted file mode 100644 index 8b13789..0000000 --- a/doc +++ /dev/null @@ -1 +0,0 @@ -