diff --git a/piflow-bundle/src/main/resources/microorganism/EMBL_Logo.svg b/piflow-bundle/src/main/resources/microorganism/EMBL_Logo.svg
new file mode 100644
index 0000000..20959c9
--- /dev/null
+++ b/piflow-bundle/src/main/resources/microorganism/EMBL_Logo.svg
@@ -0,0 +1,76 @@
+
diff --git a/piflow-bundle/src/main/resources/microorganism/down.json b/piflow-bundle/src/main/resources/microorganism/down.json
new file mode 100644
index 0000000..8c60db1
--- /dev/null
+++ b/piflow-bundle/src/main/resources/microorganism/down.json
@@ -0,0 +1,31 @@
+{
+ "flow":{
+ "name":"test",
+ "uuid":"1234",
+ "stops":[
+ {
+ "uuid":"1111",
+ "name":"LoadFromFtpToHDFS",
+ "bundle":"cn.piflow.bundle.ftp.LoadFromFtpToHDFS",
+ "properties":{
+ "url_str":"ftp.ebi.ac.uk",
+ "port":"",
+ "username":"",
+ "password":"",
+ "ftpFile":"/pub/databases/ena/sequence/release/con/rel_con_env_07_r138.dat.gz",
+ "HDFSUrl":"hdfs://10.0.88.70:9000",
+ "HDFSPath":"/yqd/weishengwu/embl/",
+ "isFile":"true"
+ }
+ }
+ ],
+ "paths":[
+ {
+ "from":"",
+ "outport":"",
+ "inport":"",
+ "to":""
+ }
+ ]
+ }
+}
\ No newline at end of file
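
Note: this flow JSON (like embl_parser.json, refseq_genome.json and select_unzip.json below) feeds its "properties" object straight into the stop's setProperties. A minimal sketch of that wiring, assuming only the LoadFromFtpToHDFS class added later in this diff:

```scala
// Sketch: hand the "properties" object from down.json to the new stop.
val props: Map[String, Any] = Map(
  "url_str"  -> "ftp.ebi.ac.uk",
  "port"     -> "",
  "username" -> "",
  "password" -> "",
  "ftpFile"  -> "/pub/databases/ena/sequence/release/con/rel_con_env_07_r138.dat.gz",
  "HDFSUrl"  -> "hdfs://10.0.88.70:9000",
  "HDFSPath" -> "/yqd/weishengwu/embl/",
  "isFile"   -> "true"
)
val stop = new cn.piflow.bundle.ftp.LoadFromFtpToHDFS()
stop.setProperties(props) // each value is read back via MapUtil.get inside the stop
```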
diff --git a/piflow-bundle/src/main/resources/microorganism/embl_parser.json b/piflow-bundle/src/main/resources/microorganism/embl_parser.json
new file mode 100644
index 0000000..dc40445
--- /dev/null
+++ b/piflow-bundle/src/main/resources/microorganism/embl_parser.json
@@ -0,0 +1,67 @@
+{
+ "flow":{
+ "name":"test",
+ "uuid":"1234",
+ "stops":[
+
+ {
+ "uuid":"1111",
+ "name":"SelectFilesByName",
+ "bundle":"cn.piflow.bundle.ftp.SelectFilesByName",
+ "properties":{
+ "HDFSUrl":"hdfs://10.0.88.70:9000",
+ "HDFSPath":"/yqd/weishengwu/embl",
+ "selectionConditions":".*con_pro_02_r138.dat.gz,.*con_vrl_01_r138.dat.gz,.*pat_phg_01_r138.dat.gz"
+ }
+ },{
+ "uuid":"2222",
+ "name":"UnzipFilesOnHDFS_1",
+ "bundle":"cn.piflow.bundle.http.UnzipFilesOnHDFS_1",
+ "properties":{
+ "isCustomize":"false",
+ "filePath":"",
+ "fileType":"gz",
+ "unzipPath":""
+
+ }
+ },
+ {
+ "uuid":"3333",
+ "name":"EmblParser",
+ "bundle":"cn.piflow.bundle.microorganism.EmblParser",
+ "properties":{
+ }
+ },{
+ "uuid":"4444",
+ "name":"PutEs",
+ "bundle":"cn.piflow.bundle.es.PutEs",
+ "properties":{
+ "es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72",
+ "port": "9200",
+ "es_index": "embl",
+ "es_type": "embl"
+ }
+ }
+ ],
+ "paths":[
+ {
+ "from":"SelectFilesByName",
+ "outport":"",
+ "inport":"",
+ "to":"UnzipFilesOnHDFS_1"
+ },
+ {
+ "from":"UnzipFilesOnHDFS_1",
+ "outport":"",
+ "inport":"",
+ "to":"EmblParser"
+ },
+ {
+ "from":"EmblParser",
+ "outport":"",
+ "inport":"",
+ "to":"PutEs"
+ }
+ ]
+ }
+}
\ No newline at end of file
diff --git a/piflow-bundle/src/main/resources/microorganism/refseq.png b/piflow-bundle/src/main/resources/microorganism/refseq.png
new file mode 100644
index 0000000..5448ab9
Binary files /dev/null and b/piflow-bundle/src/main/resources/microorganism/refseq.png differ
diff --git a/piflow-bundle/src/main/resources/microorganism/refseq_genome.json b/piflow-bundle/src/main/resources/microorganism/refseq_genome.json
new file mode 100644
index 0000000..98f997a
--- /dev/null
+++ b/piflow-bundle/src/main/resources/microorganism/refseq_genome.json
@@ -0,0 +1,67 @@
+{
+ "flow":{
+ "name":"test",
+ "uuid":"1234",
+ "stops":[
+
+ {
+ "uuid":"1111",
+ "name":"SelectFilesByName",
+ "bundle":"cn.piflow.bundle.ftp.SelectFilesByName",
+ "properties":{
+ "HDFSUrl":"hdfs://10.0.88.70:9000",
+ "HDFSPath":"/yqd/weishengwu/refseq/",
+ "selectionConditions":".*genomic.gbff.gz"
+ }
+ },{
+ "uuid":"2222",
+ "name":"UnzipFilesOnHDFS_1",
+ "bundle":"cn.piflow.bundle.http.UnzipFilesOnHDFS_1",
+ "properties":{
+ "isCustomize":"false",
+ "filePath":"",
+ "fileType":"gz",
+ "unzipPath":""
+
+ }
+ },
+ {
+ "uuid":"3333",
+ "name":"Refseq_genomeParser",
+ "bundle":"cn.piflow.bundle.microorganism.Refseq_genomeParser",
+ "properties":{
+ }
+ },{
+ "uuid":"4444",
+ "name":"PutEs",
+ "bundle":"cn.piflow.bundle.es.PutEs",
+ "properties":{
+ "es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72",
+ "port": "9200",
+ "es_index": "genome",
+ "es_type": "archaea"
+ }
+ }
+ ],
+ "paths":[
+ {
+ "from":"SelectFilesByName",
+ "outport":"",
+ "inport":"",
+ "to":"UnzipFilesOnHDFS_1"
+ },
+ {
+ "from":"UnzipFilesOnHDFS_1",
+ "outport":"",
+ "inport":"",
+ "to":"Refseq_genomeParser"
+ },
+ {
+ "from":"Refseq_genomeParser",
+ "outport":"",
+ "inport":"",
+ "to":"PutEs"
+ }
+ ]
+ }
+}
\ No newline at end of file
diff --git a/piflow-bundle/src/main/resources/microorganism/select_unzip.json b/piflow-bundle/src/main/resources/microorganism/select_unzip.json
new file mode 100644
index 0000000..29c65d2
--- /dev/null
+++ b/piflow-bundle/src/main/resources/microorganism/select_unzip.json
@@ -0,0 +1,37 @@
+{
+ "flow":{
+ "name":"test",
+ "uuid":"1234",
+ "stops":[
+ {
+ "uuid":"0000",
+ "name":"SelectFilesByName",
+ "bundle":"cn.piflow.bundle.ftp.SelectFilesByName",
+ "properties":{
+ "HDFSUrl":"hdfs://10.0.88.70:9000",
+ "HDFSPath":"/yqd/",
+ "selectionConditions":".*genomic.gbff.gz"
+ }
+ },{
+ "uuid":"1111",
+ "name":"UnzipFilesOnHDFS_1",
+ "bundle":"cn.piflow.bundle.http.UnzipFilesOnHDFS_1",
+ "properties":{
+ "isCustomize":"true",
+ "filePath":"hdfs://10.0.88.70:9000/yqd/archaea.1.genomic.gbff.gz",
+ "fileType":"gz",
+ "unzipPath":"hdfs://10.0.88.70:9000/yqd/weishengwu/"
+
+ }
+ }
+ ],
+ "paths":[
+ {
+ "from":"SelectFilesByName",
+ "outport":"",
+ "inport":"",
+ "to":"UnzipFilesOnHDFS_1"
+ }
+ ]
+ }
+}
\ No newline at end of file
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ftp/LoadFromFtpToHDFS.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ftp/LoadFromFtpToHDFS.scala
new file mode 100644
index 0000000..211ba74
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ftp/LoadFromFtpToHDFS.scala
@@ -0,0 +1,141 @@
+package cn.piflow.bundle.ftp
+
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroupEnum}
+import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
+import org.apache.commons.net.ftp.{FTP, FTPClient, FTPClientConfig, FTPFile}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
+
+class LoadFromFtpToHDFS extends ConfigurableStop {
+ override val authorEmail: String = "yangqidong@cnic.cn"
+  override val description: String = "Load a file from an FTP server and save it to HDFS"
+ override val inportList: List[String] = List(PortEnum.NonePort.toString)
+ override val outportList: List[String] = List(PortEnum.NonePort.toString)
+
+ var url_str:String =_
+ var port:String=_
+ var username:String=_
+ var password:String=_
+ var ftpFile:String=_
+ var HDFSUrl:String=_
+ var HDFSPath:String=_
+ var isFile:String=_
+
+ var fs: FileSystem=null
+ var con: FTPClientConfig =null
+
+ def downFile(ftp: FTPClient,ftpFilePath:String,HDFSSavePath:String): Unit = {
+
+ val changeFlag: Boolean = ftp.changeWorkingDirectory(ftpFilePath)
+ val files: Array[FTPFile] = ftp.listFiles()
+ for(x <- files ) {
+ if (x.isFile) {
+ println("down start ^^^ "+x.getName)
+ val hdfsPath: Path = new Path(HDFSSavePath + x.getName)
+ if(! fs.exists(hdfsPath)){
+ var fdos: FSDataOutputStream = fs.create(hdfsPath)
+ ftp.retrieveFile(new String(x.getName.getBytes("GBK"),"ISO-8859-1"), fdos)
+ fdos.close()
+ }
+ } else {
+ downFile(ftp,ftpFilePath+x.getName+"/",HDFSSavePath+x.getName+"/")
+ }
+ }
+
+ }
+
+
+ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+
+ val configuration: Configuration = new Configuration()
+ configuration.set("fs.defaultFS", HDFSUrl)
+ fs = FileSystem.get(configuration)
+
+ val ftp:FTPClient = openFtpClient()
+
+ if(isFile.equals("true")){
+ val pathArr: Array[String] = ftpFile.split("/")
+ var dirPath:String=""
+ for(x <- (0 until pathArr.length-1)){
+ dirPath += (pathArr(x)+"/")
+ }
+ ftp.changeWorkingDirectory(dirPath)
+
+ var fdos: FSDataOutputStream = fs.create(new Path(HDFSPath+pathArr.last))
+ ftp.retrieveFile(new String(pathArr.last.getBytes("GBK"),"ISO-8859-1"), fdos)
+ fdos.flush()
+ fdos.close()
+ }else{
+ downFile(ftp,ftpFile,HDFSPath)
+ }
+ }
+
+ def openFtpClient(): FTPClient = {
+ val ftp = new FTPClient
+ if(port.length > 0 ){
+ ftp.connect(url_str,port.toInt)
+ }else{
+ ftp.connect(url_str)
+ }
+ if(username.length > 0 && password.length > 0){
+ ftp.login(username,password)
+ }else{
+ ftp.login("anonymous", "121@hotmail.com")
+ }
+ ftp.setControlEncoding("GBK")
+ con = new FTPClientConfig(FTPClientConfig.SYST_NT)
+ con.setServerLanguageCode("zh")
+ ftp.setFileType(FTP.BINARY_FILE_TYPE)
+ ftp
+ }
+
+
+ override def setProperties(map: Map[String, Any]): Unit = {
+ url_str=MapUtil.get(map,key="url_str").asInstanceOf[String]
+ port=MapUtil.get(map,key="port").asInstanceOf[String]
+ username=MapUtil.get(map,key="username").asInstanceOf[String]
+ password=MapUtil.get(map,key="password").asInstanceOf[String]
+ ftpFile=MapUtil.get(map,key="ftpFile").asInstanceOf[String]
+ HDFSUrl=MapUtil.get(map,key="HDFSUrl").asInstanceOf[String]
+ HDFSPath=MapUtil.get(map,key="HDFSPath").asInstanceOf[String]
+ isFile=MapUtil.get(map,key="isFile").asInstanceOf[String]
+ }
+
+
+ override def getPropertyDescriptor(): List[PropertyDescriptor] = {
+ var descriptor : List[PropertyDescriptor] = List()
+ val url_str = new PropertyDescriptor().name("url_str").displayName("URL").defaultValue("IP of FTP server, such as 128.136.0.1 or ftp.ei.addfc.gak").required(true)
+ val port = new PropertyDescriptor().name("port").displayName("PORT").defaultValue("Port of FTP server").required(false)
+ val username = new PropertyDescriptor().name("username").displayName("USER_NAME").defaultValue("").required(false)
+ val password = new PropertyDescriptor().name("password").displayName("PASSWORD").defaultValue("").required(false)
+ val ftpFile = new PropertyDescriptor().name("ftpFile").displayName("FTP_File").defaultValue("The path of the file to the FTP server, such as /test/Ab/ or /test/Ab/test.txt").required(true)
+ val HDFSUrl = new PropertyDescriptor().name("HDFSUrl").displayName("HDFSUrl").defaultValue("The URL of the HDFS file system, such as hdfs://10.0.88.70:9000").required(true)
+ val HDFSPath = new PropertyDescriptor().name("HDFSPath").displayName("HDFSPath").defaultValue("The save path of the HDFS file system, such as /test/Ab/").required(true)
+    val isFile = new PropertyDescriptor().name("isFile").displayName("isFile").defaultValue("Whether the path is a file. If true, only the single file specified by the path is downloaded. If false, all files under the folder are downloaded recursively.").required(true)
+ descriptor = isFile :: descriptor
+ descriptor = url_str :: descriptor
+ descriptor = port :: descriptor
+ descriptor = username :: descriptor
+ descriptor = password :: descriptor
+ descriptor = ftpFile :: descriptor
+ descriptor = HDFSUrl :: descriptor
+ descriptor = HDFSPath :: descriptor
+ descriptor
+  }
+
+ override def getIcon(): Array[Byte] = {
+ ImageUtil.getImage("ftp.png")
+ }
+
+ override def getGroup(): List[String] = {
+ List(StopGroupEnum.FtpGroup.toString)
+ }
+
+ override def initialize(ctx: ProcessContext): Unit = {
+
+ }
+
+
+}
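
For reference, downFile above mirrors an FTP directory tree onto HDFS by recursing into every non-file entry. A self-contained sketch of the same commons-net pattern (anonymous login and binary mode, as in openFtpClient); the host and start directory are placeholder values:

```scala
import org.apache.commons.net.ftp.{FTP, FTPClient}

val ftp = new FTPClient
ftp.connect("ftp.example.org")                  // placeholder host
ftp.login("anonymous", "anonymous@example.org") // anonymous login, as in openFtpClient
ftp.setFileType(FTP.BINARY_FILE_TYPE)

// Walk the remote tree: files are reported, directories are recursed into,
// mirroring the downFile logic above (minus the HDFS writes).
def walk(dir: String): Unit =
  for (f <- ftp.listFiles(dir)) {
    if (f.isFile) println(s"would fetch $dir${f.getName}")
    else walk(dir + f.getName + "/")
  }

walk("/pub/")
ftp.logout()
ftp.disconnect()
```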
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ftp/SelectFilesByName.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ftp/SelectFilesByName.scala
new file mode 100644
index 0000000..d85a045
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ftp/SelectFilesByName.scala
@@ -0,0 +1,109 @@
+package cn.piflow.bundle.ftp
+
+import java.util.regex.Pattern
+
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroupEnum}
+import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.types.{StringType, StructField, StructType}
+import org.apache.spark.sql.{DataFrame, Row, SparkSession}
+
+import scala.collection.mutable.ArrayBuffer
+
+class SelectFilesByName extends ConfigurableStop{
+ override val authorEmail: String = "yangqidong@cnic.cn"
+ override val description: String = "Selecting files by file name"
+ override val inportList: List[String] = List(PortEnum.NonePort.toString)
+ override val outportList: List[String] = List(PortEnum.DefaultPort.toString)
+
+ var HDFSUrl:String=_
+ var HDFSPath:String=_
+ var selectionConditions:String =_
+
+ var fs: FileSystem=null
+ var pathARR:ArrayBuffer[String]=ArrayBuffer()
+ var selectArr:Array[String]=null
+
+ def selectFile(path: String): Unit = {
+ val statusesARR: Array[FileStatus] = fs.listStatus(new Path(path))
+ for(each <- statusesARR){
+ val pathStr = each.getPath.toString
+ if(each.isFile){
+ val fileName: String = pathStr.split("/").last
+ selectArr = selectionConditions.split(",")
+ var b: Boolean =false
+ for(x <- selectArr){
+ b = Pattern.matches(x,fileName)
+ if(b){
+ pathARR += pathStr
+ }
+ }
+ }else{
+ selectFile(pathStr)
+ }
+ }
+ }
+
+ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+
+ val session: SparkSession = pec.get[SparkSession]()
+
+ val configuration: Configuration = new Configuration()
+ configuration.set("fs.defaultFS", HDFSUrl)
+ fs = FileSystem.get(configuration)
+
+ selectFile(HDFSPath)
+
+ val rows: List[Row] = pathARR.map(each => {
+ var arr:Array[String]=Array(each)
+ val row: Row = Row.fromSeq(arr)
+ row
+ }).toList
+ val rowRDD: RDD[Row] = session.sparkContext.makeRDD(rows)
+    val fields: Array[StructField] = Array(StructField("path", StringType, nullable = true))
+ val schema: StructType = StructType(fields)
+ val df: DataFrame = session.createDataFrame(rowRDD,schema)
+
+
+ println("#################################################")
+ df.show(20)
+ println("#################################################")
+
+ out.write(df)
+ }
+
+ override def setProperties(map: Map[String, Any]): Unit = {
+ HDFSUrl=MapUtil.get(map,key="HDFSUrl").asInstanceOf[String]
+ HDFSPath=MapUtil.get(map,key="HDFSPath").asInstanceOf[String]
+ selectionConditions=MapUtil.get(map,key="selectionConditions").asInstanceOf[String]
+ }
+
+
+ override def getPropertyDescriptor(): List[PropertyDescriptor] = {
+ var descriptor : List[PropertyDescriptor] = List()
+ val HDFSUrl = new PropertyDescriptor().name("HDFSUrl").displayName("HDFSUrl").defaultValue("The URL of the HDFS file system, such as hdfs://10.0.88.70:9000").required(true)
+ val HDFSPath = new PropertyDescriptor().name("HDFSPath").displayName("HDFSPath").defaultValue("The save path of the HDFS file system, such as /test/Ab").required(true)
+    val selectionConditions = new PropertyDescriptor().name("selectionConditions").displayName("selectionConditions").defaultValue("The selection conditions: one or more Java regular expressions separated by commas, such as .*abc.*").required(true)
+ descriptor = HDFSUrl :: descriptor
+ descriptor = HDFSPath :: descriptor
+ descriptor = selectionConditions :: descriptor
+ descriptor
+ }
+
+ override def getIcon(): Array[Byte] = {
+ ImageUtil.getImage("ftp.png")
+ }
+
+ override def getGroup(): List[String] = {
+ List(StopGroupEnum.FtpGroup.toString)
+ }
+
+ override def initialize(ctx: ProcessContext): Unit = {
+
+ }
+
+}
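
The selectionConditions matching above boils down to: keep a file if its name matches any of the comma-separated Java regular expressions. A minimal sketch of that check, with a sample conditions string:

```scala
import java.util.regex.Pattern

val selectionConditions = ".*con_pro_02_r138.dat.gz,.*genomic.gbff.gz" // sample value
val patterns = selectionConditions.split(",")

// A file is selected when any pattern matches its full name.
def isSelected(fileName: String): Boolean =
  patterns.exists(p => Pattern.matches(p, fileName))

println(isSelected("archaea.1.genomic.gbff.gz")) // true
println(isSelected("readme.txt"))                // false
```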
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/http/UnzipFilesOnHDFS.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/http/UnzipFilesOnHDFS.scala
index 85d0eac..e3f7719 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/http/UnzipFilesOnHDFS.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/http/UnzipFilesOnHDFS.scala
@@ -12,178 +12,122 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
+import scala.collection.mutable.ArrayBuffer
+
class UnzipFilesOnHDFS extends ConfigurableStop {
val authorEmail: String = "yangqidong@cnic.cn"
val description: String = "Unzip files on HDFS"
- val inportList: List[String] = List(PortEnum.NonePort.toString)
+ val inportList: List[String] = List(PortEnum.DefaultPort.toString)
val outportList: List[String] = List(PortEnum.DefaultPort.toString)
+ var isCustomize:String=_
var filePath:String=_
var fileType:String=_
var unzipPath:String=_
- def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- val session: SparkSession = pec.get[SparkSession]()
+ var session: SparkSession = null
+
+ def unzipFile(hdfsFilePath: String, zipFileType: String, unzipHdfsPath: String):String = {
+ var zft: String = ""
+ if(zipFileType.length < 1){
+ zft = hdfsFilePath.split("\\.").last
+ }else{
+ zft = zipFileType
+ }
val configuration: Configuration = new Configuration()
- val pathARR: Array[String] = filePath.split("\\/")
+ val pathARR: Array[String] = hdfsFilePath.split("\\/")
var hdfsUrl:String=""
for (x <- (0 until 3)){
hdfsUrl+=(pathARR(x) +"/")
}
configuration.set("fs.defaultFS",hdfsUrl)
- // configuration.set("dfs.nameservices", "nameservice1")
- // configuration.set("dfs.ha.namenodes.nameservice1", "nn1,nn2");
- // configuration.set("dfs.namenode.rpc-address.nameservice1.nn1", "xxx:8020");
- // configuration.set("dfs.namenode.rpc-address.nameservice1.nn2", "xxx:8020");
- // configuration.set("dfs.client.failover.proxy.provider.nameservice1"
- // ,"org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
- // configuration.addResource("classpath:/hadoop/core-site.xml");
- // configuration.addResource("classpath:/hadoop/hdfs-site.xml");
- // configuration.addResource("classpath:/hadoop/mapred-site.xml");
+ var uhp : String=""
+ if(unzipHdfsPath.length < 1){
+ for (x <- (0 until pathARR.length-1)){
+ uhp+=(pathARR(x) +"/")
+ }
+ }else{
+ uhp=unzipHdfsPath
+ }
val fs = FileSystem.get(configuration)
- val fdis: FSDataInputStream = fs.open(new Path(filePath))
-
-
- val filePathArr: Array[String] = filePath.split("/")
+ val fdis: FSDataInputStream = fs.open(new Path(hdfsFilePath))
+ val filePathArr: Array[String] = hdfsFilePath.split("/")
var fileName: String = filePathArr.last
if(fileName.length == 0){
fileName = filePathArr(filePathArr.size-2)
}
- if(fileType.equals("gz")){
+ var savePath:String=""
+ if(zft.equals("gz")){
val gzip: GZIPInputStream = new GZIPInputStream(fdis)
var n = -1
val buf=new Array[Byte](10*1024*1024)
- val savePath = new Path(unzipPath +fileName.replace(".gz",""))
- val fdos = fs.create(savePath)
+ savePath = uhp +fileName.replace(".gz","")
+ val path = new Path(savePath)
+ val fdos = fs.create(path)
while((n=gzip.read(buf)) != -1 && n != -1){
fdos.write(buf,0,n)
fdos.flush()
}
-
-
- }/*else if(fileType.equals("tar")){
-
- var entryNum:Int=0
- var entryFileName:String=null
- var entryFile:File=null
- var subEntryFile:File=null
- var subEntryFileName:String=null
- var tarArchiveEntries:Array[TarArchiveEntry]=null
- var fileList:List[String]=List()
- var fos:FileOutputStream=null
-
- var entry: TarArchiveEntry = null
- val tarIs: TarArchiveInputStream = new TarArchiveInputStream(fdis)
- while ((entry = tarIs.getNextTarEntry) != null && entry != null) {
- entryFileName= localPath +File.separator+entry.getName()
- entryFile=new File(entryFileName)
- entryNum += 1
- if(entry.isDirectory()){
- if(!entryFile.exists()){
- entryFile.mkdirs()
- }
- tarArchiveEntries=entry.getDirectoryEntries()
- for(i<-0 until tarArchiveEntries.length){
- subEntryFileName=entryFileName+File.separator+tarArchiveEntries(i).getName()
- subEntryFile=new File(subEntryFileName)
- fileList=subEntryFileName::fileList
- fos=new FileOutputStream(subEntryFile)
- var mark = -1
- val buf=new Array[Byte](4*1024)
- while((mark=tarIs.read(buf)) != -1 && mark != -1){
- fos.write(buf,0,mark)
- }
- fos.close()
- fos=null
- }
- }else{
- fileList = entryFileName :: fileList
- fos=new FileOutputStream(entryFile)
- var mark = -1
- val buf=new Array[Byte](4*1024)
- while((mark=tarIs.read(buf)) != -1 && mark != -1){
- fos.write(buf,0,mark)
- }
- fos.close()
- fos=null
- }
-
- }
- if(entryNum==0){
- println("there is no file!")
- }
-
- }else if(fileType.equals("tar.gz")){
-
- var entryNum:Int=0
- var entryFileName:String=null
- var entryFile:File=null
- var subEntryFile:File=null
- var subEntryFileName:String=null
- var tarArchiveEntries:Array[TarArchiveEntry]=null
- var fileList:List[String]=List()
- var fos:FileOutputStream=null
-
- var entry: TarArchiveEntry = null
- val gzip:GZIPInputStream=new GZIPInputStream(fdis)
- val tarIs: TarArchiveInputStream = new TarArchiveInputStream(gzip)
- while ((entry = tarIs.getNextTarEntry) != null && entry != null) {
- entryFileName=localPath +File.separator+entry.getName()
- entryFile=new File(entryFileName)
- entryNum += 1
- if(entry.isDirectory()){
- if(!entryFile.exists()){
- entryFile.mkdirs()
- }
- tarArchiveEntries=entry.getDirectoryEntries()
- for(i<-0 until tarArchiveEntries.length){
- subEntryFileName=entryFileName+File.separator+tarArchiveEntries(i).getName()
- subEntryFile=new File(subEntryFileName)
- fileList=subEntryFileName::fileList
- fos=new FileOutputStream(subEntryFile)
- var mark = -1
- val buf=new Array[Byte](4*1024)
- while((mark=tarIs.read(buf)) != -1 && mark != -1){
- fos.write(buf,0,mark)
- }
- fos.close()
- fos=null
- }
- }else{
- fileList = entryFileName :: fileList
- fos=new FileOutputStream(entryFile)
- var mark = -1
- val buf=new Array[Byte](4*1024)
- while((mark=tarIs.read(buf)) != -1 && mark != -1){
- fos.write(buf,0,mark)
- }
- fos.close()
- fos=null
- }
-
- }
- if(entryNum==0){
- println("there is no file!")
- }
- }*/else{
+ fdos.close()
+ gzip.close()
+ fdis.close()
+ }else{
throw new RuntimeException("File type fill in error, or do not support this type.")
}
- var seq:Seq[String]=Seq(unzipPath)
- val row: Row = Row.fromSeq(seq)
- val list:List[Row]=List(row)
- val rdd: RDD[Row] = session.sparkContext.makeRDD(list)
+ savePath
+
+ }
+
+ def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+
+ session = pec.get[SparkSession]()
+
+ var savePath: String = ""
+ var arr:ArrayBuffer[Row]=ArrayBuffer()
+
+
+ if(isCustomize.equals("true")){
+      println("unzip with customized file path: " + filePath)
+
+ savePath = unzipFile(filePath,fileType,unzipPath)
+
+
+ println("savepath : "+savePath)
+
+ arr += Row.fromSeq(Array(savePath))
+
+ }else if (isCustomize.equals("false")){
+
+ val inDf: DataFrame = in.read()
+ inDf.collect().foreach(row => {
+
+ filePath = row.get(0).asInstanceOf[String]
+ savePath = unzipFile(filePath,"","")
+ arr += Row.fromSeq(Array(savePath))
+ savePath = ""
+
+ })
+
+ }
+
+ val rdd: RDD[Row] = session.sparkContext.makeRDD(arr.toList)
val fields: Array[StructField] =Array(StructField("unzipPath",StringType,nullable = true))
val schema: StructType = StructType(fields)
val df: DataFrame = session.createDataFrame(rdd,schema)
+ println("##################################################################################################")
+// println(df.count())
+ df.show(20)
+ println("##################################################################################################")
+
out.write(df)
}
@@ -193,6 +137,7 @@ class UnzipFilesOnHDFS extends ConfigurableStop {
}
def setProperties(map : Map[String, Any]) = {
+ isCustomize=MapUtil.get(map,key="isCustomize").asInstanceOf[String]
filePath=MapUtil.get(map,key="filePath").asInstanceOf[String]
fileType=MapUtil.get(map,key="fileType").asInstanceOf[String]
unzipPath=MapUtil.get(map,key="unzipPath").asInstanceOf[String]
@@ -201,9 +146,15 @@ class UnzipFilesOnHDFS extends ConfigurableStop {
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
var descriptor : List[PropertyDescriptor] = List()
- val filePath = new PropertyDescriptor().name("filePath").displayName("filePath").description("file path,such as hdfs://10.0.86.89:9000/a/a.gz").defaultValue("").required(true)
- val fileType = new PropertyDescriptor().name("fileType").displayName("fileType").description("file type,such as gz").defaultValue("").required(true)
+ val filePath = new PropertyDescriptor().name("filePath").displayName("filePath").description("file path,such as hdfs://10.0.86.89:9000/a/a.gz").defaultValue("").required(false)
+ val fileType = new PropertyDescriptor().name("fileType").displayName("fileType").description("file type,such as gz").defaultValue("").required(false)
val unzipPath = new PropertyDescriptor().name("unzipPath").displayName("unzipPath").description("unzip path, such as hdfs://10.0.86.89:9000/b/").defaultValue("").required(true)
+    val isCustomize = new PropertyDescriptor().name("isCustomize").displayName("isCustomize").description("Whether to customize the compressed file path. If true, " +
+      "you must specify the path of the compressed file and the save path after decompression. " +
+      "If false, the file paths are read from the upstream port and the decompressed files are " +
+      "saved to their original folder.")
+ .defaultValue("").required(false)
+ descriptor = isCustomize :: descriptor
descriptor = filePath :: descriptor
descriptor = fileType :: descriptor
descriptor = unzipPath :: descriptor
@@ -216,7 +167,7 @@ class UnzipFilesOnHDFS extends ConfigurableStop {
}
override def getGroup(): List[String] = {
- List(StopGroup.HttpGroup.toString)
+ List(StopGroupEnum.HttpGroup.toString)
}
}
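
The gz branch of unzipFile streams a GZIPInputStream from one HDFS path into another. A minimal standalone sketch of that copy (namenode address and file names are placeholders); note that Scala assignment returns Unit, which is why the stop needs the extra `n != -1` clause in its while condition, and why this sketch reads before testing instead:

```scala
import java.util.zip.GZIPInputStream
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

val conf = new Configuration()
conf.set("fs.defaultFS", "hdfs://namenode:9000") // placeholder
val fs = FileSystem.get(conf)

// Stream-decompress /data/sample.json.gz to /data/sample.json on HDFS.
val in  = new GZIPInputStream(fs.open(new Path("/data/sample.json.gz")))
val out = fs.create(new Path("/data/sample.json"))
val buf = new Array[Byte](1024 * 1024)
var n = in.read(buf)
while (n != -1) {
  out.write(buf, 0, n)
  n = in.read(buf)
}
in.close()
out.close()
```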
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/EmblParser.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/EmblParser.scala
new file mode 100644
index 0000000..f1adbe5
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/EmblParser.scala
@@ -0,0 +1,168 @@
+package cn.piflow.bundle.microorganism
+
+import java.io._
+
+import cn.piflow.bundle.microorganism.util.{CustomIOTools, Process}
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.ImageUtil
+import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroupEnum}
+import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
+import org.apache.spark.sql.{DataFrame, SparkSession}
+import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator}
+import org.json.JSONObject
+
+class EmblParser extends ConfigurableStop{
+ override val authorEmail: String = "yangqidong@cnic.cn"
+  override val description: String = "Parse EMBL-format data"
+ override val inportList: List[String] =List(PortEnum.DefaultPort.toString)
+ override val outportList: List[String] = List(PortEnum.DefaultPort.toString)
+
+
+
+ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+
+ val session = pec.get[SparkSession]()
+
+ val inDf: DataFrame = in.read()
+ val configuration: Configuration = new Configuration()
+
+ var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String]
+ val pathARR: Array[String] = pathStr.split("\\/")
+ var hdfsUrl:String=""
+ for (x <- (0 until 3)){
+ hdfsUrl+=(pathARR(x) +"/")
+ }
+ configuration.set("fs.defaultFS",hdfsUrl)
+ var fs: FileSystem = FileSystem.get(configuration)
+
+    val hdfsPathTemporary:String = hdfsUrl+"/EmblParser_temporary.json"
+ val path: Path = new Path(hdfsPathTemporary)
+
+ if(fs.exists(path)){
+      fs.delete(path, true)
+ }
+
+ fs.create(path).close()
+ var fdos: FSDataOutputStream = fs.append(path)
+
+ var jsonStr: String =""
+
+ var bis: BufferedInputStream =null
+
+ // var df: DataFrame =null
+ // var d: DataFrame =null
+ // var jsonRDD: RDD[String] =null
+
+ inDf.collect().foreach(row => {
+
+ var n : Int =0
+ pathStr = row.get(0).asInstanceOf[String]
+
+ println("#############################################")
+ println("start parser ^^^" + pathStr)
+ println("#############################################")
+
+// if(pathStr.equals("hdfs://10.0.88.70:9000/yqd/weishengwu/refseq/bacteria.1.genomic.gbff")) {
+
+
+ var fdis: FSDataInputStream = fs.open(new Path(pathStr))
+ // var fdis: FSDataInputStream = fs.open(new Path("hdfs://10.0.88.70:9000/yqd/weishengwu/refseq/bacteria.1.1.genomic.fna.gz"))
+
+ // var gzipout: GZIPInputStream = new GZIPInputStream(fdis)
+
+ // var br: BufferedReader = new BufferedReader(new InputStreamReader(gzipout))
+
+ var br: BufferedReader = new BufferedReader(new InputStreamReader(fdis))
+
+ var sequences: RichSequenceIterator = CustomIOTools.IOTools.readEMBLDNA (br, null)
+
+ while (sequences.hasNext) {
+ n += 1
+ var seq: RichSequence = sequences.nextRichSequence()
+ var doc: JSONObject = new JSONObject
+ Process.processEMBL_EnsemblSeq(seq, doc)
+ jsonStr = doc.toString
+ println("start " + n)
+
+ if (n == 1) {
+ bis = new BufferedInputStream(new ByteArrayInputStream(("[" + jsonStr).getBytes()))
+ } else {
+ bis = new BufferedInputStream(new ByteArrayInputStream(("," + jsonStr).getBytes()))
+ }
+
+ val buff: Array[Byte] = new Array[Byte](1048576)
+
+ var count: Int = bis.read(buff)
+ while (count != -1) {
+ fdos.write(buff, 0, count)
+ fdos.flush()
+ count = bis.read(buff)
+ }
+
+ /* if(n==1){
+ jsonRDD = session.sparkContext.makeRDD(jsonStr :: Nil)
+ df = session.read.json(jsonRDD)
+ }else{
+ jsonRDD = session.sparkContext.makeRDD(jsonStr :: Nil)
+ d = session.read.json(jsonRDD)
+ df = df.union(d.toDF(df.columns:_*))
+ }*/
+
+ fdos.flush()
+ bis = null
+ seq = null
+ doc = null
+ // jsonRDD = null
+ // d = null
+ }
+ bis = new BufferedInputStream(new ByteArrayInputStream(("]").getBytes()))
+ val buff: Array[Byte] = new Array[Byte](1048576)
+
+ var count: Int = bis.read(buff)
+ while (count != -1) {
+ fdos.write(buff, 0, count)
+ fdos.flush()
+ count = bis.read(buff)
+ }
+ fdos.flush()
+// }
+ })
+
+ fdos.close()
+
+ println("start parser HDFSjsonFile")
+ val df: DataFrame = session.read.json(hdfsPathTemporary)
+
+ println("############################################################")
+ // println(df.count())
+ df.show(20)
+ println("############################################################")
+ out.write(df)
+
+
+ }
+
+ override def setProperties(map: Map[String, Any]): Unit = {
+
+ }
+
+ override def getPropertyDescriptor(): List[PropertyDescriptor] ={
+ var descriptor : List[PropertyDescriptor] = List()
+ descriptor
+ }
+
+ override def getIcon(): Array[Byte] = {
+ ImageUtil.getImage("/microorganism/EMBL_Logo.svg")
+ }
+
+ override def getGroup(): List[String] = {
+ List(StopGroupEnum.MicroorganismGroup.toString)
+ }
+
+ override def initialize(ctx: ProcessContext): Unit = {
+
+ }
+
+}
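
EmblParser assembles one valid JSON array on HDFS by writing "[" before the first record, "," before every later record, and a trailing "]", so session.read.json can load the temporary file in one pass. A sketch of just that assembly step:

```scala
import java.io.{ByteArrayOutputStream, OutputStream}

// Emit records as a single JSON array: "[" + r1 + "," + r2 + ... + "]".
def writeJsonArray(records: Iterator[String], out: OutputStream): Unit = {
  var first = true
  for (r <- records) {
    out.write((if (first) "[" else ",").getBytes("UTF-8"))
    out.write(r.getBytes("UTF-8"))
    first = false
  }
  out.write("]".getBytes("UTF-8"))
}

val buf = new ByteArrayOutputStream()
writeJsonArray(Iterator("""{"acc":"A1"}""", """{"acc":"A2"}"""), buf)
println(buf.toString("UTF-8")) // [{"acc":"A1"},{"acc":"A2"}]
```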
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/RefseqParser.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/RefseqParser.scala
new file mode 100644
index 0000000..c82eb54
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/RefseqParser.scala
@@ -0,0 +1,168 @@
+package cn.piflow.bundle.microorganism
+
+import java.io._
+
+import cn.piflow.bundle.microorganism.util.{CustomIOTools, Process}
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.ImageUtil
+import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroupEnum}
+import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
+import org.apache.spark.sql.{DataFrame, SparkSession}
+import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator}
+import org.json.JSONObject
+
+class RefseqParser extends ConfigurableStop{
+ override val authorEmail: String = "yangqidong@cnic.cn"
+  override val description: String = "Parse RefSeq genome data"
+ override val inportList: List[String] =List(PortEnum.DefaultPort.toString)
+ override val outportList: List[String] = List(PortEnum.DefaultPort.toString)
+
+
+
+ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+
+ val session = pec.get[SparkSession]()
+
+ val inDf: DataFrame = in.read()
+ val configuration: Configuration = new Configuration()
+
+ var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String]
+ val pathARR: Array[String] = pathStr.split("\\/")
+ var hdfsUrl:String=""
+ for (x <- (0 until 3)){
+ hdfsUrl+=(pathARR(x) +"/")
+ }
+ configuration.set("fs.defaultFS",hdfsUrl)
+ var fs: FileSystem = FileSystem.get(configuration)
+
+ val hdfsPathTemporary:String = hdfsUrl+"/Refseq_genomeParser_temporary.json"
+ val path: Path = new Path(hdfsPathTemporary)
+
+ if(fs.exists(path)){
+      fs.delete(path, true)
+ }
+
+ fs.create(path).close()
+ var fdos: FSDataOutputStream = fs.append(path)
+
+ var jsonStr: String =""
+
+ var bis: BufferedInputStream =null
+
+// var df: DataFrame =null
+// var d: DataFrame =null
+// var jsonRDD: RDD[String] =null
+
+ inDf.collect().foreach(row => {
+
+ var n : Int =0
+ pathStr = row.get(0).asInstanceOf[String]
+
+ println("#############################################")
+ println("start parser ^^^" + pathStr)
+ println("#############################################")
+
+// if(pathStr.equals("hdfs://10.0.88.70:9000/yqd/weishengwu/refseq/bacteria.1.genomic.gbff")) {
+
+
+ var fdis: FSDataInputStream = fs.open(new Path(pathStr))
+// var fdis: FSDataInputStream = fs.open(new Path("hdfs://10.0.88.70:9000/yqd/weishengwu/refseq/bacteria.1.1.genomic.fna.gz"))
+
+// var gzipout: GZIPInputStream = new GZIPInputStream(fdis)
+
+// var br: BufferedReader = new BufferedReader(new InputStreamReader(gzipout))
+
+ var br: BufferedReader = new BufferedReader(new InputStreamReader(fdis))
+
+ var sequences: RichSequenceIterator = CustomIOTools.IOTools.readGenbankProtein(br, null)
+
+ while (sequences.hasNext) {
+ n += 1
+ var seq: RichSequence = sequences.nextRichSequence()
+ var doc: JSONObject = new JSONObject
+ Process.processSingleSequence(seq, doc)
+ jsonStr = doc.toString
+ println("start " + n)
+
+ if (n == 1) {
+ bis = new BufferedInputStream(new ByteArrayInputStream(("[" + jsonStr).getBytes()))
+ } else {
+ bis = new BufferedInputStream(new ByteArrayInputStream(("," + jsonStr).getBytes()))
+ }
+
+ val buff: Array[Byte] = new Array[Byte](1048576)
+
+ var count: Int = bis.read(buff)
+ while (count != -1) {
+ fdos.write(buff, 0, count)
+ fdos.flush()
+ count = bis.read(buff)
+ }
+
+ /* if(n==1){
+ jsonRDD = session.sparkContext.makeRDD(jsonStr :: Nil)
+ df = session.read.json(jsonRDD)
+ }else{
+ jsonRDD = session.sparkContext.makeRDD(jsonStr :: Nil)
+ d = session.read.json(jsonRDD)
+ df = df.union(d.toDF(df.columns:_*))
+ }*/
+
+ fdos.flush()
+ bis = null
+ seq = null
+ doc = null
+ // jsonRDD = null
+ // d = null
+ }
+ bis = new BufferedInputStream(new ByteArrayInputStream(("]").getBytes()))
+ val buff: Array[Byte] = new Array[Byte](1048576)
+
+ var count: Int = bis.read(buff)
+ while (count != -1) {
+ fdos.write(buff, 0, count)
+ fdos.flush()
+ count = bis.read(buff)
+ }
+ fdos.flush()
+// }
+ })
+
+ fdos.close()
+
+ println("start parser HDFSjsonFile")
+ val df: DataFrame = session.read.json(hdfsPathTemporary)
+
+ println("############################################################")
+// println(df.count())
+ df.show(20)
+ println("############################################################")
+ out.write(df)
+
+
+ }
+
+ override def setProperties(map: Map[String, Any]): Unit = {
+
+ }
+
+ override def getPropertyDescriptor(): List[PropertyDescriptor] ={
+ var descriptor : List[PropertyDescriptor] = List()
+ descriptor
+  }
+
+ override def getIcon(): Array[Byte] = {
+ ImageUtil.getImage("/microorganism/refseq.png")
+ }
+
+ override def getGroup(): List[String] = {
+ List(StopGroupEnum.MicroorganismGroup.toString)
+ }
+
+ override def initialize(ctx: ProcessContext): Unit = {
+
+ }
+
+}
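
Both parsers recover the HDFS URL by concatenating the first three "/"-separated tokens of the input path. A one-liner equivalent of that loop, shown on a sample path:

```scala
// "hdfs://10.0.88.70:9000/yqd/x.gbff".split("/") gives
// Array("hdfs:", "", "10.0.88.70:9000", "yqd", "x.gbff"), so re-joining the
// first three tokens with "/" restores the filesystem URL.
def hdfsUrlOf(pathStr: String): String =
  pathStr.split("/").take(3).mkString("", "/", "/")

println(hdfsUrlOf("hdfs://10.0.88.70:9000/yqd/weishengwu/refseq/x.gbff"))
// prints: hdfs://10.0.88.70:9000/
```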
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomEMBLFormat.java b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomEMBLFormat.java
new file mode 100644
index 0000000..f4cd463
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomEMBLFormat.java
@@ -0,0 +1,1151 @@
+package cn.piflow.bundle.microorganism.util;
+
+import org.biojava.bio.seq.Sequence;
+import org.biojava.bio.seq.io.ParseException;
+import org.biojava.bio.seq.io.SeqIOListener;
+import org.biojava.bio.seq.io.SymbolTokenization;
+import org.biojava.bio.symbol.IllegalSymbolException;
+import org.biojava.bio.symbol.SimpleSymbolList;
+import org.biojava.bio.symbol.Symbol;
+import org.biojava.bio.symbol.SymbolList;
+import org.biojava.utils.ChangeVetoException;
+import org.biojavax.*;
+import org.biojavax.bio.seq.MultiSourceCompoundRichLocation;
+import org.biojavax.bio.seq.RichFeature;
+import org.biojavax.bio.seq.RichLocation;
+import org.biojavax.bio.seq.RichSequence;
+import org.biojavax.bio.seq.io.GenbankLocationParser;
+import org.biojavax.bio.seq.io.RichSeqIOListener;
+import org.biojavax.bio.seq.io.RichSequenceFormat;
+import org.biojavax.bio.taxa.NCBITaxon;
+import org.biojavax.bio.taxa.SimpleNCBITaxon;
+import org.biojavax.ontology.ComparableTerm;
+import org.biojavax.utils.StringTools;
+
+import java.io.*;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Created by xiujuan on 2016/1/27.
+ */
+public class CustomEMBLFormat extends RichSequenceFormat.HeaderlessFormat {
+ // Register this format with the format auto-guesser.
+ static {
+ RichSequence.IOTools.registerFormat(CustomEMBLFormat.class);
+ }
+
+ /**
+ * The name of the Pre-87 format
+ */
+ public static final String EMBL_PRE87_FORMAT = "EMBL_PRE87";
+
+ /**
+ * The name of the current format
+ */
+ public static final String EMBL_FORMAT = "EMBL";
+
+ protected static final String LOCUS_TAG = "ID";
+ protected static final String ACCESSION_TAG = "AC";
+ protected static final String VERSION_TAG = "SV";
+ protected static final String DEFINITION_TAG = "DE";
+ protected static final String DATE_TAG = "DT";
+ protected static final String DATABASE_XREF_TAG = "DR";
+ protected static final String SOURCE_TAG = "OS";
+ protected static final String ORGANISM_TAG = "OC";
+ protected static final String ORGANELLE_TAG = "OG";
+ protected static final String REFERENCE_TAG = "RN";
+ protected static final String REFERENCE_POSITION_TAG = "RP";
+ protected static final String REFERENCE_XREF_TAG = "RX";
+ protected static final String AUTHORS_TAG = "RA";
+ protected static final String CONSORTIUM_TAG = "RG";
+ protected static final String TITLE_TAG = "RT";
+ protected static final String LOCATOR_TAG = "RL";
+ protected static final String REMARK_TAG = "RC";
+ protected static final String KEYWORDS_TAG = "KW";
+ protected static final String COMMENT_TAG = "CC";
+ protected static final String FEATURE_HEADER_TAG = "FH";
+ protected static final String FEATURE_TAG = "FT";
+ protected static final String CONTIG_TAG = "CO";
+ protected static final String TPA_TAG = "AH";
+ protected static final String START_SEQUENCE_TAG = "SQ";
+ protected static final String DELIMITER_TAG = "XX";
+ protected static final String END_SEQUENCE_TAG = "//";
+
+ // the date pattern
+ // date (Rel. N, Created)
+ // date (Rel. N, Last updated, Version M)
+ protected static final Pattern dp = Pattern.compile("([^\\s]+)\\s*(\\(Rel\\.\\s+(\\d+), ([^\\)\\d]+)(\\d*)\\))?$");
+ // locus line
+ protected static final Pattern lp = Pattern.compile("^(\\S+);\\s+SV\\s+(\\d+);\\s+(linear|circular);\\s+(\\S+\\s?\\S+?);\\s+(\\S+);\\s+(\\S+);\\s+(\\d+)\\s+(BP|AA)\\.$");
+ protected static final Pattern lpPre87 = Pattern.compile("^(\\S+)\\s+standard;\\s+(circular)?\\s*(genomic)?\\s*(\\S+);\\s+(\\S+);\\s+\\d+\\s+BP\\.$");
+ // version line
+ protected static final Pattern vp = Pattern.compile("^(\\S+?)\\.(\\d+)$");
+ // reference position line
+ protected static final Pattern rpp = Pattern.compile("^(\\d+)(-(\\d+))?,?(\\s?\\d+-\\d+,?)*$");
+ // dbxref line
+ protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$");
+
+ protected static final Pattern readableFileNames = Pattern.compile(".*\\u002e(em|dat).*");
+ protected static final Pattern headerLine = Pattern.compile("^ID.*");
+
+ private NCBITaxon tax = null;
+ private String organism = null;
+ private String accession = null;
+
+ /**
+ * Implements some EMBL-specific terms.
+ */
+ public static class Terms extends RichSequence.Terms {
+
+ /**
+ * Getter for the RelUpdatedRecordVersion term
+ * @return The RelUpdatedRecordVersion Term
+ */
+ public static ComparableTerm getRelUpdatedRecordVersionTerm() {
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("RelUpdatedRecordVersion");
+ }
+
+ /**
+ * Getter for the EMBL term
+ * @return The EMBL Term
+ */
+ public static ComparableTerm getEMBLTerm() {
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("EMBL");
+ }
+
+ /**
+ * Getter for the Ensembl-specific 'genomic' term
+ * @return The genomic Term
+ */
+ public static ComparableTerm getGenomicTerm() {
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("genomic");
+ }
+
+ /**
+ * Getter for the Ensembl-specific 'versionLine' term
+ * @return The version line Term
+ */
+ public static ComparableTerm getVersionLineTerm() {
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("versionLine");
+ }
+
+ /**
+ * Getter for the Ensembl-specific 'dataClass' term
+ * @return The data class Term
+ */
+ public static ComparableTerm getDataClassTerm() {
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("dataClass");
+ }
+
+ /**
+ * Getter for the Ensembl-specific 'organism' term
+ * @return The organism Term - "ORGANISM_TAG"
+ * added by xiujuan 2016-1-28
+ */
+ public static ComparableTerm getOrganismTerm(){
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("organism");
+ }
+
+ /**
+ * @return The length
+ * added by xiujuan 2016-1-28
+ */
+ public static ComparableTerm getLengthTerm(){
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("length");
+ }
+ }
+
+ /**
+ * {@inheritDoc}
+     * A file is in EMBL format if its name contains ".em" or ".dat", or the first line matches
+ * the EMBL format for the ID line.
+ */
+ public boolean canRead(File file) throws IOException {
+ if (readableFileNames.matcher(file.getName()).matches()) return true;
+ BufferedReader br = new BufferedReader(new FileReader(file));
+ String firstLine = br.readLine();
+ boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() &&
+ (lp.matcher(firstLine.substring(3).trim()).matches() ||
+ lpPre87.matcher(firstLine.substring(3).trim()).matches()
+ );
+ br.close();
+ return readable;
+ }
+
+ /**
+ * {@inheritDoc}
+ * Always returns a DNA tokenizer.
+ */
+ public SymbolTokenization guessSymbolTokenization(File file) throws IOException {
+ return RichSequence.IOTools.getDNAParser();
+ }
+
+ /**
+ * {@inheritDoc}
+ * A stream is in EMBL format if its first line matches the EMBL format for the ID line.
+ */
+ public boolean canRead(BufferedInputStream stream) throws IOException {
+ stream.mark(2000); // some streams may not support this
+ BufferedReader br = new BufferedReader(new InputStreamReader(stream));
+ String firstLine = br.readLine();
+ boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() &&
+ (lp.matcher(firstLine.substring(3).trim()).matches() ||
+ lpPre87.matcher(firstLine.substring(3).trim()).matches()
+ );
+ // don't close the reader as it'll close the stream too.
+ // br.close();
+ stream.reset();
+ return readable;
+ }
+
+ /**
+ * {@inheritDoc}
+ * Always returns a DNA tokenizer.
+ */
+ public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException {
+ return RichSequence.IOTools.getDNAParser();
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public boolean readSequence(BufferedReader reader,
+ SymbolTokenization symParser,
+ SeqIOListener listener)
+ throws IllegalSymbolException, IOException, ParseException {
+ if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today");
+ return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null);
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public boolean readRichSequence(BufferedReader reader,
+ SymbolTokenization symParser,
+ RichSeqIOListener rlistener,
+ Namespace ns)
+ throws IllegalSymbolException, IOException, ParseException {
+ tax = null;
+ organism = null;
+ accession = null;
+ boolean hasAnotherSequence = true;
+ //boolean hasInternalWhitespace = false;
+
+ rlistener.startSequence();
+
+ if (ns==null) ns=RichObjectFactory.getDefaultNamespace();
+ rlistener.setNamespace(ns);
+
+ // Get an ordered list of key->value pairs in array-tuples
+ String sectionKey = null;
+ do {
+ List section = this.readSection(reader);
+ sectionKey = ((String[])section.get(0))[0];
+ if(sectionKey == null){
+
+ String message = ParseException.newMessage(this.getClass(), accession, "No section key", "Not set", sectionToString(section));
+ System.err.println("error happens: " + message);
+ throw new ParseException(message);
+ }
+ // process section-by-section
+ if (sectionKey.equals(LOCUS_TAG)) {
+ // entryname dataclass; [circular] molecule; division; sequencelength BP.
+ String loc = ((String[])section.get(0))[1];
+ Matcher m = lp.matcher(loc);
+ Matcher mPre87 = lpPre87.matcher(loc);
+ if (m.matches()) {
+ // first token is both name and primary accession
+ rlistener.setName(m.group(1));
+ rlistener.setAccession(m.group(1));
+ // second token is version
+ rlistener.setVersion(Integer.parseInt(m.group(2)));
+ // third token is circular/linear
+ rlistener.setCircular(m.group(3).equals("circular"));
+ // fourth token is moltype
+ rlistener.addSequenceProperty(Terms.getMolTypeTerm(),m.group(4));
+ // fifth token is data class
+ rlistener.addSequenceProperty(Terms.getDataClassTerm(),m.group(5));
+ // sixth token is taxonomic division
+ rlistener.setDivision(m.group(6));
+ // seventh token is sequence length, which is ignored
+ // as it is calculated from the sequence data later.
+ } else if (mPre87.matches()) {
+ rlistener.setName(mPre87.group(1));
+ if (mPre87.group(3)!=null) {
+ // add annotation for 'genomic' (Ensembl-specific term)
+ rlistener.addSequenceProperty(Terms.getGenomicTerm(),null);
+ }
+ rlistener.addSequenceProperty(Terms.getMolTypeTerm(),mPre87.group(4));
+ rlistener.setDivision(mPre87.group(5));
+ // Optional extras
+ String circular = mPre87.group(2);
+ if (circular!=null) rlistener.setCircular(true);
+ } else {
+ String message = ParseException.newMessage(this.getClass(),accession,"Not Set","Bad ID line found", sectionToString(section));
+ System.err.println("error happens: " + message);
+ throw new ParseException(message);
+ }
+ } else if (sectionKey.equals(DEFINITION_TAG)) {
+ rlistener.setDescription(((String[])section.get(0))[1]);
+ } else if (sectionKey.equals(SOURCE_TAG)) {
+ // only interested in organelle sub-tag
+ for (int i = 1; i < section.size(); i++) {
+ sectionKey = ((String[])section.get(i))[0];
+ if (sectionKey.equals(ORGANELLE_TAG)) {
+ rlistener.addSequenceProperty(Terms.getOrganelleTerm(), ((String[])section.get(i))[1].trim());
+ break; // skip out of for loop once found
+ }
+ if(sectionKey.equals(ORGANISM_TAG)){
+ rlistener.addSequenceProperty(Terms.getOrganismTerm(), ((String[])section.get(i))[1].trim());
+ break;
+ }
+ }
+ } else if (sectionKey.equals(DATE_TAG)) {
+ String chunk = ((String[])section.get(0))[1].trim();
+ Matcher dm = dp.matcher(chunk);
+ if (dm.matches()) {
+ String date = dm.group(1);
+ String rel = dm.group(3);
+ String type = dm.group(4);
+ if (type.equals("Created")) {
+ rlistener.addSequenceProperty(Terms.getDateCreatedTerm(), date);
+ rlistener.addSequenceProperty(Terms.getRelCreatedTerm(), rel);
+ } else if (type.equals("Last updated, Version ")) {
+ rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(), date);
+ rlistener.addSequenceProperty(Terms.getRelUpdatedTerm(), rel);
+ rlistener.addSequenceProperty(Terms.getRelUpdatedRecordVersionTerm(), dm.group(5));
+ } else {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad date type found",sectionToString(section));
+ System.err.println("error happens: " + message);
+ throw new ParseException(message);
+ }
+ } else {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad date line found",sectionToString(section));
+ System.err.println("error happens: " + message);
+ throw new ParseException(message);
+
+ }
+ } else if (sectionKey.equals(ACCESSION_TAG)) {
+ // if multiple accessions, store only first as accession,
+ // and store rest in annotation
+ String[] accs = ((String[])section.get(0))[1].split(";");
+ accession = accs[0].trim();
+ rlistener.setAccession(accession);
+ for (int i = 1; i < accs.length; i++) {
+ rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accs[i].trim());
+ }
+ } else if (sectionKey.equals(VERSION_TAG)) {
+ String ver = ((String[])section.get(0))[1];
+ Matcher m = vp.matcher(ver);
+ if (m.matches()) {
+ String verAcc = m.group(1);
+ if (!accession.equals(verAcc)) {
+ // the version refers to a different accession!
+ // believe the version line, and store the original
+ // accession away in the additional accession set
+ rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accession);
+ accession = verAcc;
+ rlistener.setAccession(accession);
+ }
+ rlistener.setVersion(Integer.parseInt(m.group(2)));
+ } else {
+ rlistener.addSequenceProperty(Terms.getVersionLineTerm(),ver);
+ }
+ } else if (sectionKey.equals(KEYWORDS_TAG)) {
+ String val = ((String[])section.get(0))[1];
+ val = val.substring(0,val.length()-1); // chomp dot
+ val = val.replace('\n',' '); //remove newline
+ String[] kws = val.split(";");
+ for (int i = 0; i < kws.length; i++) {
+ String kw = kws[i].trim();
+ if (kw.length()==0) continue;
+ rlistener.addSequenceProperty(Terms.getKeywordTerm(), kw);
+ }
+ } else if (sectionKey.equals(DATABASE_XREF_TAG)) {
+ String val = ((String[])section.get(0))[1];
+ val = val.substring(0,val.length()-1); // chomp dot
+ // database_identifier; primary_identifier; secondary_identifier....
+ String[] parts = val.split(";");
+ // construct a DBXREF out of the dbname part[0] and accession part[1]
+ CrossRef crossRef = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{parts[0].trim(),parts[1].trim(), new Integer(0)});
+ // assign remaining bits of info as annotations
+ for (int j = 2; j < parts.length; j++) {
+ Note note = new SimpleNote(Terms.getAdditionalAccessionTerm(),parts[j].trim(),j-1);
+ try {
+ crossRef.getRichAnnotation().addNote(note);
+ } catch (ChangeVetoException ce) {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "Could not annotate identifier terms",sectionToString(section));
+ ParseException pe = new ParseException(message);
+ System.err.println("error happens: " + message);
+ pe.initCause(ce);
+ throw pe;
+ }
+ }
+ RankedCrossRef rcrossRef = new SimpleRankedCrossRef(crossRef, 0);
+ rlistener.setRankedCrossRef(rcrossRef);
+ } else if (sectionKey.equals(REFERENCE_TAG) && !this.getElideReferences()) {
+ // first line of section has rank and location
+ String refrank = ((String[])section.get(0))[1];
+ int ref_rank = Integer.parseInt(refrank.substring(1,refrank.length()-1));
+ int ref_start = -999;
+ int ref_end = -999;
+ // rest can be in any order
+ String consortium = null;
+ String authors = "";
+ String title = null;
+ String locator = null;
+ String pubmed = null;
+ String medline = null;
+ String doi = null;
+ String remark = null;
+ for (int i = 1; i < section.size(); i++) {
+ String key = ((String[])section.get(i))[0];
+ String val = ((String[])section.get(i))[1];
+ if (key.equals(AUTHORS_TAG)) {
+ if (val.endsWith(";")) val = val.substring(0,val.length()-1); // chomp semicolon
+ authors = val.replace('\n',' '); //see #2276
+ }else if (key.equals(CONSORTIUM_TAG)) {
+ if (val.endsWith(";")) val = val.substring(0,val.length()-1); // chomp semicolon
+ consortium = val.replace('\n',' '); //see #2276
+ }else if (key.equals(TITLE_TAG)) {
+ if (val.length()>1) {
+ if (val.endsWith(";")) val = val.substring(0,val.length()-1); // chomp semicolon
+ if (val.endsWith("\"")) val = val.substring(1,val.length()-1); // chomp quotes
+ title = val.replace('\n',' '); //see #2276
+ } else title=null; // single semi-colon indicates no title
+ }else if (key.equals(LOCATOR_TAG)) {
+ if (val.endsWith(".")) val = val.substring(0,val.length()-1); // chomp dot
+ locator = val.replace('\n',' '); //see #2276
+ }else if (key.equals(REFERENCE_XREF_TAG)) {
+ // database_identifier; primary_identifier.
+ String[] refs = val.split("\\.(\\s+|$)");
+ for (int j = 0 ; j < refs.length; j++) {
+ if (refs[j].trim().length()==0) continue;
+ String[] parts = refs[j].split(";");
+ if(parts.length == 2){
+ String db = parts[0];
+ String ref = parts[1].trim();
+ if (db.equalsIgnoreCase(Terms.PUBMED_KEY)) pubmed = ref;
+ else if (db.equalsIgnoreCase(Terms.MEDLINE_KEY)) medline = ref;
+ else if (db.equalsIgnoreCase(Terms.DOI_KEY)) doi = ref;
+ }
+ }
+ }else if (key.equals(REMARK_TAG)) remark = val.replace('\n',' '); //see #2276
+ else if (key.equals(REFERENCE_POSITION_TAG)) {
+ // only the first group is taken
+ // if we have multiple lines, only the last line is taken
+ Matcher m = rpp.matcher(val);
+ if (m.matches()) {
+ ref_start = Integer.parseInt(m.group(1));
+ if(m.group(2) != null)
+ ref_end = Integer.parseInt(m.group(3));
+ } else {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad reference line found",sectionToString(section));
+ System.err.println("error happens: " + message);
+ throw new ParseException(message);
+ }
+ }
+ }
+ // create the docref object
+ try {
+ List authSet = DocRefAuthor.Tools.parseAuthorString(authors);
+ if (consortium!=null) authSet.add(new SimpleDocRefAuthor(consortium, true, false));
+ DocRef dr = (DocRef)RichObjectFactory.getObject(SimpleDocRef.class,new Object[]{authSet,locator,title});
+ // assign either the pubmed or medline to the docref - medline gets priority, then pubmed, then doi
+ if (medline!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.MEDLINE_KEY, medline, new Integer(0)}));
+ else if (pubmed!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.PUBMED_KEY, pubmed, new Integer(0)}));
+ else if (doi!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.DOI_KEY, doi, new Integer(0)}));
+ // assign the remarks
+ if (!this.getElideComments()) dr.setRemark(remark);
+ // assign the docref to the bioentry
+ RankedDocRef rdr = new SimpleRankedDocRef(dr,
+ (ref_start != -999 ? new Integer(ref_start) : null),
+ (ref_end != -999 ? new Integer(ref_end) : null),
+ ref_rank);
+                    rlistener.setRankedDocRef(rdr);
+ } catch (ChangeVetoException e) {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section));
+ System.err.println("error happens: " + message);
+ throw new ParseException(e, message);
+ }
+ } else if (sectionKey.equals(COMMENT_TAG) && !this.getElideComments()) {
+ // Set up some comments
+ rlistener.setComment(((String[])section.get(0))[1]);
+ } else if (sectionKey.equals(FEATURE_TAG) && !this.getElideFeatures()) {
+ // starting from second line of input, start a new feature whenever we come across
+ // a key that does not start with /
+ boolean seenAFeature = false;
+ int rcrossrefCount = 0;
+ boolean skippingBond = false;
+ for (int i = 1 ; i < section.size(); i++) {
+ String key = ((String[])section.get(i))[0];
+ String val = ((String[])section.get(i))[1];
+ if (key.startsWith("/")) {
+ if(!skippingBond){
+ key = key.substring(1); // strip leading slash
+ val = val.replaceAll("\\s*[\\n\\r]+\\s*"," ").trim();
+ if (val.startsWith("\"")) val = val.substring(1,val.length()-1); // strip quotes
+ // parameter on old feature
+ if (key.equalsIgnoreCase("db_xref")) {
+ Matcher m = dbxp.matcher(val);
+ if (m.matches()) {
+ String dbname = m.group(1);
+ String raccession = m.group(2);
+ if (dbname.equalsIgnoreCase("taxon")) {
+ // Set the Taxon instead of a dbxref
+ tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{Integer.valueOf(raccession)});
+ rlistener.setTaxon(tax);
+ try {
+ if (organism!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism);
+ } catch (ChangeVetoException e) {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section));
+ System.err.println("error happens: " + message);
+ throw new ParseException(e, message);
+ }
+ } else {
+ try {
+ CrossRef cr = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{dbname, raccession, new Integer(0)});
+ RankedCrossRef rcr = new SimpleRankedCrossRef(cr, ++rcrossrefCount);
+ rlistener.getCurrentFeature().addRankedCrossRef(rcr);
+ } catch (ChangeVetoException e) {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section));
+ System.err.println("error happens: " + message);
+ throw new ParseException(e, message);
+ }
+ }
+ } else {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad dbxref found",sectionToString(section));
+ System.err.println("error happens: " + message);
+ throw new ParseException(message);
+ }
+ } else if (key.equalsIgnoreCase("organism")) {
+ try {
+ organism = val;
+ if (tax!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism);
+ } catch (ChangeVetoException e) {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section));
+ System.err.println("error happens: " + message);
+ throw new ParseException(message);
+ }
+ } else {
+ if (key.equalsIgnoreCase("translation")) {
+ // strip spaces from sequence
+ val = val.replaceAll("\\s+","");
+ }
+ rlistener.addFeatureProperty(RichObjectFactory.getDefaultOntology().getOrCreateTerm(key),val);
+ }
+ }
+ } else {
+ // new feature!
+ // end previous feature
+ if(key.equalsIgnoreCase("bond"))
+ {
+ skippingBond = true;
+ }else{
+ skippingBond = false;
+ if (seenAFeature) {
+ rlistener.endFeature();
+ }
+ // start next one, with lots of lovely info in it
+ RichFeature.Template templ = new RichFeature.Template();
+ templ.annotation = new SimpleRichAnnotation();
+ templ.sourceTerm = Terms.getEMBLTerm();
+ templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(key);
+ templ.featureRelationshipSet = new TreeSet();
+ templ.rankedCrossRefs = new TreeSet();
+ String tidyLocStr = val.replaceAll("\\s+","");
+ templ.location = GenbankLocationParser.parseLocation(ns, accession, tidyLocStr);
+ if(!(templ.location instanceof MultiSourceCompoundRichLocation)){
+ rlistener.startFeature(templ);
+ seenAFeature = true;
+ rcrossrefCount = 0;
+ }else{
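+ // a location assembled from multiple source sequences cannot be
+ // represented on a single bioentry (assumption based on the skip logic
+ // here), so drop the feature; skippingBond also discards its qualifiers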
+ System.err.println("encounter a MultiSourceCompoundRichLocation instance");
+ skippingBond = true;
+ seenAFeature = false;
+ }
+ }
+ }
+ }
+ if (seenAFeature) rlistener.endFeature();
+ } else if (sectionKey.equals(START_SEQUENCE_TAG) && !this.getElideSymbols()) {
+ StringBuffer seq = new StringBuffer();
+ for (int i = 0 ; i < section.size(); i++) seq.append(((String[])section.get(i))[1]);
+ try {
+ SymbolList sl = new SimpleSymbolList(symParser,
+ seq.toString().replaceAll("\\s+","").replaceAll("[\\.|~]","-"));
+ rlistener.addSymbols(symParser.getAlphabet(),
+ (Symbol[])(sl.toList().toArray(new Symbol[0])),
+ 0, sl.length());
+ } catch (Exception e) {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad sequence",sectionToString(section));
+ System.err.println("error happens: " + message);
+ throw new ParseException(e, message);
+ }
+ }
+ } while (!sectionKey.equals(END_SEQUENCE_TAG));
+
+ // Allows us to tolerate trailing whitespace without
+ // thinking that there is another Sequence to follow
+ while (true) {
+ reader.mark(1);
+ int c = reader.read();
+ if (c == -1) {
+ hasAnotherSequence = false;
+ break;
+ }
+ if (Character.isWhitespace((char) c)) {
+ //hasInternalWhitespace = true;
+ continue;
+ }
+ //if (hasInternalWhitespace)
+ // System.err.println("Warning: whitespace found between sequence entries");
+ reader.reset();
+ break;
+ }
+
+ // Finish up.
+ rlistener.endSequence();
+ return hasAnotherSequence;
+ }
+
+ // reads an indented section, combining split lines and creating a list of key->value tuples
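+ // e.g. a reference section comes back roughly as
+ // [["RN","[1]"], ["RA","Smith J.;"], ["RT","\"A title\";"], ["RL","..."]]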
+ private List readSection(BufferedReader br) throws ParseException {
+ List section = new ArrayList();
+ String line;
+ boolean done = false;
+
+ // while not done
+ try {
+ while (!done) {
+ // mark buffer
+ br.mark(320);
+ // read token
+ line = br.readLine();
+ if (line.length()<2) {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad line found",line);
+ System.err.println("error happens: " + message);
+ throw new ParseException(message);
+ }
+ String token = line.substring(0,2);
+ // READ SEQUENCE SECTION
+ if (token.equals(START_SEQUENCE_TAG)) {
+ // from next line, read sequence until // - leave // on stack
+ StringBuffer sb = new StringBuffer();
+ while (!done) {
+ br.mark(160);
+ line = br.readLine();
+ if (line.startsWith(END_SEQUENCE_TAG)) {
+ br.reset();
+ done = true;
+ } else {
+ // create sequence tag->value pair to return, sans numbers
+ sb.append(line.replaceAll("\\d",""));
+ }
+ }
+ section.add(new String[]{START_SEQUENCE_TAG,sb.toString()});
+ }
+ // READ FEATURE TABLE SECTION
+ else if (token.equals(FEATURE_HEADER_TAG)) {
+ // create dummy feature tag->value pair and add to return set
+ section.add(new String[]{FEATURE_TAG,null});
+ // drop next FH line
+ line = br.readLine(); // skip next line too - it is also FH
+ // read all FT lines until XX
+ String currentTag = null;
+ StringBuffer currentVal = null;
+ while (!done) {
+ line = br.readLine();
+ if (line.startsWith(DELIMITER_TAG)) {
+ done = true;
+ // dump current tag if exists
+ if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
+ } else {
+ // FT lines: FT word value
+ // or FT /word
+ // or FT /db_xref="taxon:3899....
+ // ......"
+ line = line.substring(5); // chomp off "FT "
+ if (!line.startsWith(" ")) {
+ // dump current tag if exists
+ if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
+ // case 1 : word value - splits into key-value on its own
+ String[] parts = line.trim().split("\\s+");
+ currentTag = parts[0];
+ currentVal = new StringBuffer();
+ currentVal.append(parts[1]);
+ } else {
+ line = line.trim();
+ if (line.startsWith("/")) {
+ // dump current tag if exists
+ if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
+ // case 2 : /word[=.....]
+ currentVal = new StringBuffer();
+ int equalIndex = line.indexOf('=');
+ if (equalIndex>=0) {
+ currentTag = line.substring(0, equalIndex);
+ currentVal.append(line.substring(equalIndex+1));
+ } else {
+ currentTag = line;
+ }
+ } else {
+ // case 3 : ...."
+ currentVal.append("\n");
+ currentVal.append(line);
+ }
+ }
+ }
+ }
+ }
+ // READ END OF SEQUENCE
+ else if (token.equals(END_SEQUENCE_TAG)) {
+ section.add(new String[]{END_SEQUENCE_TAG,null});
+ done = true;
+ }
+ // READ DELIMITER TAG
+ else if (token.equals(DELIMITER_TAG)) {
+ section.add(new String[]{DELIMITER_TAG,null});
+ done = true;
+ }
+ // READ THIRD PARTY ANNOTATION SECTION
+ else if (token.equals(TPA_TAG)) {
+ // exception = don't know how to do TPA yet
+ // TODO: 2016/6/27 run into here with accession BK000583, HE580237
+ /*String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle TPAs just yet",sectionToString(section));
+ System.err.println("error happens: " + message);
+ throw new ParseException(message);*/
+ section.add(new String[]{TPA_TAG, null});
+ done = true;
+ }
+ // READ CONTIG SECTION
+ //else if (token.equals(CONTIG_TAG)) {
+ // exception = don't know how to do contigs yet
+ //String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle contig assemblies just yet",sectionToString(section));
+ //throw new ParseException(message);
+ //}
+ //2016.1.27 modified by Xiujuan to support parsing files that contain CONTIG_TAG
+ else if (token.equals(CONTIG_TAG)) {
+ section.add(new String[]{CONTIG_TAG,null});
+ done = true;
+ }
+ // READ DOCREF
+ else if (token.equals(DATABASE_XREF_TAG)) {
+ section.add(new String[]{DATABASE_XREF_TAG,line.substring(5).trim()});
+ done = true;
+ }
+ // READ DATE
+ else if (token.equals(DATE_TAG)) {
+ section.add(new String[]{DATE_TAG,line.substring(5).trim()});
+ done = true;
+ }
+ // READ NORMAL TAG/VALUE SECTION
+ else {
+ // rewind buffer to mark
+ br.reset();
+ // read token/values until XX
+ String currentTag = null;
+ StringBuffer currentVal = null;
+ while (!done) {
+ line = br.readLine();
+ if (line.startsWith(DELIMITER_TAG)) {
+ done = true;
+ // dump current tag if exists
+ if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
+ } else {
+ try {
+ // merge neighbouring repeated tokens by concatting values
+ // return tag->value pairs
+ String tag = line.substring(0,2);
+ String value = line.substring(5);
+ if (currentTag==null || !tag.equals(currentTag)) {
+ // dump current tag if exists
+ if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
+ // start new tag
+ currentTag = tag;
+ currentVal = new StringBuffer();
+ currentVal.append(value);
+ } else {
+ currentVal.append("\n");
+ currentVal.append(value);
+ }
+ } catch (Exception e) {
+ String message = ParseException.newMessage(this.getClass(), accession, "not set","",sectionToString(section));
+ System.err.println("error happens: " + message);
+ throw new ParseException(e, message);
+ }
+ }
+ }
+ }
+ }
+ } catch (IOException e) {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle TPAs just yet",sectionToString(section));
+ System.err.println("error happens: " + message);
+ throw new ParseException(message);
+ }
+ return section;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public void writeSequence(Sequence seq, PrintStream os) throws IOException {
+ if (this.getPrintStream()==null) this.setPrintStream(os);
+ this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException {
+ if (this.getPrintStream()==null) this.setPrintStream(os);
+ this.writeSequence(seq, format, RichObjectFactory.getDefaultNamespace());
+ }
+
+ /**
+ * {@inheritDoc}
+ * Namespace is ignored as EMBL has no concept of it.
+ */
+ public void writeSequence(Sequence seq, Namespace ns) throws IOException {
+ this.writeSequence(seq, this.getDefaultFormat(), ns);
+ }
+
+ /**
+ * As per {@link #writeSequence(Sequence, Namespace)}, except
+ * that it also takes a format parameter. This can be any of the formats
+ * defined as constants in this class.
+ * @param seq see {@link #writeSequence(Sequence, Namespace)}
+ * @param format the format to use.
+ * @param ns see {@link #writeSequence(Sequence, Namespace)}
+ * @throws IOException see {@link #writeSequence(Sequence, Namespace)}
+ */
+ public void writeSequence(Sequence seq, String format, Namespace ns) throws IOException {
+ if (!format.equals(EMBL_FORMAT) && !format.equals(EMBL_PRE87_FORMAT))
+ throw new IllegalArgumentException("Format "+format+" not recognised.");
+
+ RichSequence rs;
+ try {
+ if (seq instanceof RichSequence) rs = (RichSequence)seq;
+ else rs = RichSequence.Tools.enrich(seq);
+ } catch (ChangeVetoException e) {
+ IOException e2 = new IOException("Unable to enrich sequence");
+ e2.initCause(e);
+ throw e2;
+ }
+
+ SymbolTokenization tok;
+ try {
+ tok = rs.getAlphabet().getTokenization("token");
+ } catch (Exception e) {
+ throw new RuntimeException("Unable to get alphabet tokenizer",e);
+ }
+
+ Set notes = rs.getNoteSet();
+ String accession = rs.getAccession();
+ StringBuffer accessions = new StringBuffer();
+ accessions.append(accession);
+ accessions.append(";");
+ String cdat = null;
+ String udat = null;
+ String crel = null;
+ String urel = null;
+ String urecv = null;
+ String organelle = null;
+ String versionLine = null;
+ String dataClass = "STD";
+ boolean genomic = false;
+ String moltype = rs.getAlphabet().getName();
+ for (Iterator i = notes.iterator(); i.hasNext(); ) {
+ Note n = (Note)i.next();
+ if (n.getTerm().equals(Terms.getDateCreatedTerm())) cdat=n.getValue();
+ else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue();
+ else if (n.getTerm().equals(Terms.getRelCreatedTerm())) crel=n.getValue();
+ else if (n.getTerm().equals(Terms.getRelUpdatedTerm())) urel=n.getValue();
+ else if (n.getTerm().equals(Terms.getRelUpdatedRecordVersionTerm())) urecv=n.getValue();
+ else if (n.getTerm().equals(Terms.getMolTypeTerm())) moltype=n.getValue();
+ else if (n.getTerm().equals(Terms.getVersionLineTerm())) versionLine=n.getValue();
+ else if (n.getTerm().equals(Terms.getGenomicTerm())) genomic = true;
+ else if (n.getTerm().equals(Terms.getDataClassTerm())) dataClass = n.getValue();
+ else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) {
+ accessions.append(" ");
+ accessions.append(n.getValue());
+ accessions.append(";");
+ } else if (n.getTerm().equals(Terms.getOrganelleTerm())) organelle=n.getValue();
+ }
+
+ StringBuffer locusLine = new StringBuffer();
+ // Division cannot be null
+ String div = rs.getDivision();
+ if(div==null || div.length()==0 || div.length()>3)
+ div = "UNC"; //Unclassified
+
+ if (format.equals(EMBL_FORMAT)) {
+ // accession; SV version; circular/linear; moltype; dataclass; division; length BP.
+ locusLine.append(rs.getAccession());
+ locusLine.append("; SV ");
+ locusLine.append(rs.getVersion());
+ locusLine.append("; ");
+ locusLine.append(rs.getCircular()?"circular":"linear");
+ locusLine.append("; ");
+ locusLine.append(moltype);
+ locusLine.append("; ");
+ locusLine.append(dataClass);
+ locusLine.append("; ");
+ locusLine.append(div);
+ locusLine.append("; ");
+ locusLine.append(rs.length());
+ locusLine.append(" BP.");
+ } else if (format.equals(EMBL_PRE87_FORMAT)) {
+ // entryname dataclass; [circular] molecule; division; sequencelength BP.
+ locusLine.append(StringTools.rightPad(rs.getName(), 9));
+ locusLine.append(" standard; ");
+ locusLine.append(rs.getCircular()?"circular ":"");
+ // if it is Ensembl genomic, add that in too
+ if (genomic) locusLine.append("genomic ");
+ locusLine.append(moltype);
+ locusLine.append("; ");
+ locusLine.append(div);
+ locusLine.append("; ");
+ locusLine.append(rs.length());
+ locusLine.append(" BP.");
+ }
+ StringTools.writeKeyValueLine(LOCUS_TAG, locusLine.toString(), 5, this.getLineWidth(), null, LOCUS_TAG, this.getPrintStream());
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+
+ // accession line
+ StringTools.writeKeyValueLine(ACCESSION_TAG, accessions.toString(), 5, this.getLineWidth(), null, ACCESSION_TAG, this.getPrintStream());
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+
+ // version line
+ if (format.equals(EMBL_PRE87_FORMAT)) {
+ if (versionLine!=null) StringTools.writeKeyValueLine(VERSION_TAG, versionLine, 5, this.getLineWidth(), null, VERSION_TAG, this.getPrintStream());
+ else StringTools.writeKeyValueLine(VERSION_TAG, accession+"."+rs.getVersion(), 5, this.getLineWidth(), null, VERSION_TAG, this.getPrintStream());
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+ }
+
+ // date line
+ StringTools.writeKeyValueLine(DATE_TAG, (cdat==null?udat:cdat)+" (Rel. "+(crel==null?"0":crel)+", Created)", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream());
+ StringTools.writeKeyValueLine(DATE_TAG, udat+" (Rel. "+(urel==null?"0":urel)+", Last updated, Version "+(urecv==null?"0":urecv)+")", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream());
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+
+ // definition line
+ StringTools.writeKeyValueLine(DEFINITION_TAG, rs.getDescription(), 5, this.getLineWidth(), null, DEFINITION_TAG, this.getPrintStream());
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+
+ // keywords line
+ StringBuffer keywords = new StringBuffer();
+ for (Iterator n = notes.iterator(); n.hasNext(); ) {
+ Note nt = (Note)n.next();
+ if (nt.getTerm().equals(Terms.getKeywordTerm())) {
+ if (keywords.length()>0) keywords.append("; ");
+ keywords.append(nt.getValue());
+ }
+ }
+ if (keywords.length()>0) {
+ keywords.append(".");
+ StringTools.writeKeyValueLine(KEYWORDS_TAG, keywords.toString(), 5, this.getLineWidth(), null, KEYWORDS_TAG, this.getPrintStream());
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+ } else {
+ this.getPrintStream().println(KEYWORDS_TAG+" .");
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+ }
+
+ // source line (from taxon)
+ // organism line
+ NCBITaxon tax = rs.getTaxon();
+ if (tax!=null) {
+ StringTools.writeKeyValueLine(SOURCE_TAG, tax.getDisplayName(), 5, this.getLineWidth(), null, SOURCE_TAG, this.getPrintStream());
+ StringTools.writeKeyValueLine(ORGANISM_TAG, tax.getNameHierarchy(), 5, this.getLineWidth(), null, SOURCE_TAG, this.getPrintStream());
+ if (organelle!=null) StringTools.writeKeyValueLine(ORGANELLE_TAG, organelle, 5, this.getLineWidth(), null, ORGANELLE_TAG, this.getPrintStream());
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+ }
+
+ // references - rank (bases x to y)
+ for (Iterator r = rs.getRankedDocRefs().iterator(); r.hasNext(); ) {
+ RankedDocRef rdr = (RankedDocRef)r.next();
+ DocRef d = rdr.getDocumentReference();
+ // RN, RC, RP, RX, RG, RA, RT, RL
+ StringTools.writeKeyValueLine(REFERENCE_TAG, "["+rdr.getRank()+"]", 5, this.getLineWidth(), null, REFERENCE_TAG, this.getPrintStream());
+ StringTools.writeKeyValueLine(REMARK_TAG, d.getRemark(), 5, this.getLineWidth(), null, REMARK_TAG, this.getPrintStream());
+ Integer rstart = rdr.getStart();
+ if (rstart==null) rstart = new Integer(1);
+ Integer rend = rdr.getEnd();
+ if (rend==null) rend = new Integer(rs.length());
+ StringTools.writeKeyValueLine(REFERENCE_POSITION_TAG, rstart+"-"+rend, 5, this.getLineWidth(), null, REFERENCE_POSITION_TAG, this.getPrintStream());
+ CrossRef c = d.getCrossref();
+ if (c!=null) StringTools.writeKeyValueLine(REFERENCE_XREF_TAG, c.getDbname()+"; "+c.getAccession()+".", 5, this.getLineWidth(), null, REFERENCE_XREF_TAG, this.getPrintStream());
+ List auths = d.getAuthorList();
+ for (Iterator j = auths.iterator(); j.hasNext(); ) {
+ DocRefAuthor a = (DocRefAuthor)j.next();
+ if (a.isConsortium()) {
+ StringTools.writeKeyValueLine(CONSORTIUM_TAG, a+";", 5, this.getLineWidth(), null, CONSORTIUM_TAG, this.getPrintStream());
+ j.remove();
+ }
+ }
+ if (!auths.isEmpty()) StringTools.writeKeyValueLine(AUTHORS_TAG, DocRefAuthor.Tools.generateAuthorString(auths, true)+";", 5, this.getLineWidth(), null, AUTHORS_TAG, this.getPrintStream());
+ else StringTools.writeKeyValueLine(AUTHORS_TAG, ";", 5, this.getLineWidth(), null, AUTHORS_TAG, this.getPrintStream());
+ if (d.getTitle()!=null && d.getTitle().length()!=0) StringTools.writeKeyValueLine(TITLE_TAG, "\""+d.getTitle()+"\";", 5, this.getLineWidth(), null, TITLE_TAG, this.getPrintStream());
+ else StringTools.writeKeyValueLine(TITLE_TAG, ";", 5, this.getLineWidth(), null, TITLE_TAG, this.getPrintStream());
+ StringTools.writeKeyValueLine(LOCATOR_TAG, d.getLocation()+".", 5, this.getLineWidth(), null, LOCATOR_TAG, this.getPrintStream());
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+ }
+
+ // db references - ranked
+ for (Iterator r = rs.getRankedCrossRefs().iterator(); r.hasNext(); ) {
+ RankedCrossRef rcr = (RankedCrossRef)r.next();
+ CrossRef c = rcr.getCrossRef();
+ Set noteset = c.getNoteSet();
+ StringBuffer sb = new StringBuffer();
+ sb.append(c.getDbname());
+ sb.append("; ");
+ sb.append(c.getAccession());
+ boolean hasSecondary = false;
+ for (Iterator i = noteset.iterator(); i.hasNext(); ) {
+ Note n = (Note)i.next();
+ if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) {
+ sb.append("; ");
+ sb.append(n.getValue());
+ hasSecondary = true;
+ }
+ }
+ //if (!hasSecondary) sb.append("; -");
+ //sb.append(".");
+ if (!hasSecondary) sb.append(";");
+ else sb.append(".");
+ StringTools.writeKeyValueLine(DATABASE_XREF_TAG, sb.toString(), 5, this.getLineWidth(), null, DATABASE_XREF_TAG, this.getPrintStream());
+ }
+ if (!rs.getRankedCrossRefs().isEmpty())
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+
+ // comments - if any
+ if (!rs.getComments().isEmpty()) {
+ StringBuffer sb = new StringBuffer();
+ for (Iterator i = rs.getComments().iterator(); i.hasNext(); ) {
+ Comment c = (Comment)i.next();
+ sb.append(c.getComment());
+ if (i.hasNext()) sb.append("\n");
+ }
+ StringTools.writeKeyValueLine(COMMENT_TAG, sb.toString(), 5, this.getLineWidth(), null, COMMENT_TAG, this.getPrintStream());
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+ }
+
+ this.getPrintStream().println(FEATURE_HEADER_TAG+" Key Location/Qualifiers");
+ this.getPrintStream().println(FEATURE_HEADER_TAG+" ");
+ // feature_type location
+ for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) {
+ RichFeature f = (RichFeature)i.next();
+ StringTools.writeKeyValueLine(FEATURE_TAG+" "+f.getTypeTerm().getName(), GenbankLocationParser.writeLocation((RichLocation)f.getLocation()), 21, this.getLineWidth(), ",", FEATURE_TAG, this.getPrintStream());
+ for (Iterator j = f.getNoteSet().iterator(); j.hasNext(); ) {
+ Note n = (Note)j.next();
+ // /key="val" or just /key if val==""
+ if (n.getValue()==null || n.getValue().length()==0) StringTools.writeKeyValueLine(FEATURE_TAG, "/"+n.getTerm().getName(), 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
+ else StringTools.writeKeyValueLine(FEATURE_TAG, "/"+n.getTerm().getName()+"=\""+n.getValue()+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
+ }
+ // add-in to source feature only organism and db_xref="taxon:xyz" where present
+ if (f.getType().equals("source") && tax!=null) {
+ String displayName = tax.getDisplayName();
+ if (displayName.indexOf('(')>-1) displayName = displayName.substring(0, displayName.indexOf('(')).trim();
+ StringTools.writeKeyValueLine(FEATURE_TAG, "/organism=\""+displayName+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
+ StringTools.writeKeyValueLine(FEATURE_TAG, "/db_xref=\"taxon:"+tax.getNCBITaxID()+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
+ }
+ // add-in other dbxrefs where present
+ for (Iterator j = f.getRankedCrossRefs().iterator(); j.hasNext(); ) {
+ RankedCrossRef rcr = (RankedCrossRef)j.next();
+ CrossRef cr = rcr.getCrossRef();
+ StringTools.writeKeyValueLine(FEATURE_TAG, "/db_xref=\""+cr.getDbname()+":"+cr.getAccession()+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
+ }
+ }
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+
+ // SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
+ int aCount = 0;
+ int cCount = 0;
+ int gCount = 0;
+ int tCount = 0;
+ int oCount = 0;
+ for (int i = 1; i <= rs.length(); i++) {
+ char c;
+ try {
+ c = tok.tokenizeSymbol(rs.symbolAt(i)).charAt(0);
+ } catch (Exception e) {
+ throw new RuntimeException("Unable to get symbol at position "+i,e);
+ }
+ switch (c) {
+ case 'a': case 'A':
+ aCount++;
+ break;
+ case 'c': case 'C':
+ cCount++;
+ break;
+ case 'g': case 'G':
+ gCount++;
+ break;
+ case 't': case 'T':
+ tCount++;
+ break;
+ default:
+ oCount++;
+ }
+ }
+ this.getPrintStream().print(START_SEQUENCE_TAG+" Sequence "+rs.length()+" BP; ");
+ this.getPrintStream().print(aCount + " A; ");
+ this.getPrintStream().print(cCount + " C; ");
+ this.getPrintStream().print(gCount + " G; ");
+ this.getPrintStream().print(tCount + " T; ");
+ this.getPrintStream().println(oCount + " other;");
+
+ // sequence stuff
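+ // EMBL layout: 60 symbols per line, written as six blocks of 10, each
+ // block preceded by a space, with a right-aligned running total closing the line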
+ Symbol[] syms = (Symbol[])rs.toList().toArray(new Symbol[0]);
+ int lineLen = 0;
+ int symCount = 0;
+ this.getPrintStream().print(" ");
+ for (int i = 0; i < syms.length; i++) {
+ if (symCount % 60 == 0 && symCount>0) {
+ this.getPrintStream().print(StringTools.leftPad(""+symCount,10));
+ this.getPrintStream().print("\n ");
+ lineLen = 0;
+ }
+ if (symCount % 10 == 0) {
+ this.getPrintStream().print(" ");
+ lineLen++;
+ }
+ try {
+ this.getPrintStream().print(tok.tokenizeSymbol(syms[i]));
+ } catch (IllegalSymbolException e) {
+ throw new RuntimeException("Found illegal symbol: "+syms[i]);
+ }
+ symCount++;
+ lineLen++;
+ }
+ this.getPrintStream().print(StringTools.leftPad(""+symCount,(66-lineLen)+10));
+ this.getPrintStream().print("\n");
+ this.getPrintStream().println(END_SEQUENCE_TAG);
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public String getDefaultFormat() {
+ return EMBL_FORMAT;
+ }
+
+
+ /**
+ * Converts the current parse section to a String. Useful for debugging.
+ */
+ String sectionToString(List section){
+ StringBuffer parseBlock = new StringBuffer();
+ for(Iterator i = section.listIterator(); i.hasNext();){
+ String[] part = (String[])i.next();
+ for(int x = 0; x < part.length; x++){
+ parseBlock.append(part[x]);
+ if(x == 0){
+ parseBlock.append(" "); //the gap will have been trimmed
+ }
+ }
+ }
+ return parseBlock.toString();
+ }
+}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomEnsemblFormat.java b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomEnsemblFormat.java
new file mode 100644
index 0000000..c65eb63
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomEnsemblFormat.java
@@ -0,0 +1,1133 @@
+package cn.piflow.bundle.microorganism.util;
+
+import org.biojava.bio.seq.Sequence;
+import org.biojava.bio.seq.io.ParseException;
+import org.biojava.bio.seq.io.SeqIOListener;
+import org.biojava.bio.seq.io.SymbolTokenization;
+import org.biojava.bio.symbol.IllegalSymbolException;
+import org.biojava.bio.symbol.Symbol;
+import org.biojava.utils.ChangeVetoException;
+import org.biojavax.*;
+import org.biojavax.bio.seq.RichFeature;
+import org.biojavax.bio.seq.RichLocation;
+import org.biojavax.bio.seq.RichSequence;
+import org.biojavax.bio.seq.io.GenbankLocationParser;
+import org.biojavax.bio.seq.io.RichSeqIOListener;
+import org.biojavax.bio.seq.io.RichSequenceFormat;
+import org.biojavax.bio.taxa.NCBITaxon;
+import org.biojavax.bio.taxa.SimpleNCBITaxon;
+import org.biojavax.ontology.ComparableTerm;
+import org.biojavax.utils.StringTools;
+
+import java.io.*;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Created by xiujuan on 2016/1/27.
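+ *
+ * <p>A minimal read-loop sketch (hypothetical file name; assumes the stock
+ * biojavax RichStreamReader and RichSequenceBuilderFactory.FACTORY API):</p>
+ * <pre>
+ * BufferedReader br = new BufferedReader(new FileReader("ensembl_dump.dat"));
+ * RichSequenceIterator seqs = new RichStreamReader(br,
+ *         new CustomEnsemblFormat(),
+ *         RichSequence.IOTools.getDNAParser(),
+ *         RichSequenceBuilderFactory.FACTORY,
+ *         RichObjectFactory.getDefaultNamespace());
+ * while (seqs.hasNext()) {
+ *     RichSequence rs = seqs.nextRichSequence();
+ *     System.out.println(rs.getAccession() + " (" + rs.length() + " bp)");
+ * }
+ * </pre>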
+ */
+public class CustomEnsemblFormat extends RichSequenceFormat.HeaderlessFormat {
+ // Register this format with the format auto-guesser.
+ static {
+ RichSequence.IOTools.registerFormat(CustomEnsemblFormat.class);
+ }
+
+ /**
+ * The name of the Pre-87 format
+ */
+ public static final String EMBL_PRE87_FORMAT = "EMBL_PRE87";
+
+ /**
+ * The name of the current format
+ */
+ public static final String EMBL_FORMAT = "EMBL";
+
+ protected static final String LOCUS_TAG = "ID";
+ protected static final String ACCESSION_TAG = "AC";
+ protected static final String VERSION_TAG = "SV";
+ protected static final String DEFINITION_TAG = "DE";
+ protected static final String DATE_TAG = "DT";
+ protected static final String DATABASE_XREF_TAG = "DR";
+ protected static final String SOURCE_TAG = "OS";
+ protected static final String ORGANISM_TAG = "OC";
+ protected static final String ORGANELLE_TAG = "OG";
+ protected static final String REFERENCE_TAG = "RN";
+ protected static final String REFERENCE_POSITION_TAG = "RP";
+ protected static final String REFERENCE_XREF_TAG = "RX";
+ protected static final String AUTHORS_TAG = "RA";
+ protected static final String CONSORTIUM_TAG = "RG";
+ protected static final String TITLE_TAG = "RT";
+ protected static final String LOCATOR_TAG = "RL";
+ protected static final String REMARK_TAG = "RC";
+ protected static final String KEYWORDS_TAG = "KW";
+ protected static final String COMMENT_TAG = "CC";
+ protected static final String FEATURE_HEADER_TAG = "FH";
+ protected static final String FEATURE_TAG = "FT";
+ protected static final String CONTIG_TAG = "CO";
+ protected static final String TPA_TAG = "AH";
+ protected static final String START_SEQUENCE_TAG = "SQ";
+ protected static final String DELIMITER_TAG = "XX";
+ protected static final String END_SEQUENCE_TAG = "//";
+
+ // the date pattern for Ensembl files
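+ // (assumed to be a single whitespace-free date token, e.g. "04-MAR-2016"; illustrative)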
+ protected static final Pattern dp_ensembl = Pattern.compile("([^\\s]+)");
+ // the date pattern
+ // date (Rel. N, Created)
+ // date (Rel. N, Last updated, Version M)
+ protected static final Pattern dp = Pattern.compile("([^\\s]+)\\s*(\\(Rel\\.\\s+(\\d+), ([^\\)\\d]+)(\\d*)\\))?$");
+ // locus line
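+ // e.g. "X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP." (the EMBL user manual's example ID line)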
+ protected static final Pattern lp = Pattern.compile("^(\\S+);\\s+SV\\s+(\\d+);\\s+(linear|circular);\\s+(\\S+\\s?\\S+?);\\s+(\\S+);\\s+(\\S+);\\s+(\\d+)\\s+(BP|AA)\\.$");
+ protected static final Pattern lpPre87 = Pattern.compile("^(\\S+)\\s+standard;\\s+(circular)?\\s*(genomic)?\\s*(\\S+);\\s+(\\S+);\\s+(\\d+)\\s+BP\\.$");
+ //protected static final Pattern ensembl_id = Pattern.compile("^\\S+\\s+\\S+;\\s+\\S+;\\s+\\S+;\\s+(\\d+)\\s+BP\\.$");
+ // version line
+ protected static final Pattern vp = Pattern.compile("^(\\S+?)\\.(\\d+)$");
+ // reference position line
+ protected static final Pattern rpp = Pattern.compile("^(\\d+)(-(\\d+))?,?(\\s\\d+-\\d+,?)*$");
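+ // e.g. "1-1859" or "1-100, 200-300"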
+ // dbxref line
+ protected static final Pattern dbxp = Pattern.compile("^([^:]+):(.+)$");
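+ // e.g. "taxon:3899" or "UniProtKB/Swiss-Prot:P12345" (illustrative accession)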
+
+ protected static final Pattern readableFileNames = Pattern.compile(".*\\u002e(em|dat).*");
+ protected static final Pattern headerLine = Pattern.compile("^ID.*");
+
+ private NCBITaxon tax = null;
+ private String organism = null;
+ private String accession = null;
+
+ /**
+ * Implements some EMBL-specific terms.
+ */
+ public static class Terms extends RichSequence.Terms {
+
+ /**
+ * Getter for the RelUpdatedRecordVersion term
+ * @return The RelUpdatedRecordVersion Term
+ */
+ public static ComparableTerm getRelUpdatedRecordVersionTerm() {
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("RelUpdatedRecordVersion");
+ }
+
+ /**
+ * Getter for the EMBL term
+ * @return The EMBL Term
+ */
+ public static ComparableTerm getEMBLTerm() {
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("EMBL");
+ }
+
+ /**
+ * Getter for the Ensembl-specific 'genomic' term
+ * @return The genomic Term
+ */
+ public static ComparableTerm getGenomicTerm() {
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("genomic");
+ }
+
+ /**
+ * Getter for the Ensembl-specific 'versionLine' term
+ * @return The version line Term
+ */
+ public static ComparableTerm getVersionLineTerm() {
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("versionLine");
+ }
+
+ /**
+ * Getter for the Ensembl-specific 'dataClass' term
+ * @return The data class Term
+ */
+ public static ComparableTerm getDataClassTerm() {
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("dataClass");
+ }
+
+ /**
+ * Getter for the Ensembl-specific 'organism' term
+ * @return The organism Term - "ORGANISM_TAG"
+ * added by xiujuan 2016-1-28
+ */
+ public static ComparableTerm getOrganismTerm(){
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("organism");
+ }
+
+ /**
+ * @return The length
+ * added by xiujuan 2016-1-28
+ */
+ public static ComparableTerm getLengthTerm(){
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("length");
+ }
+
+ /**
+ * for the ensembl file "DT" parse
+ * @return The Date
+ * added by xiujuan 2016-1-28
+ */
+ public static ComparableTerm getDateTerm(){
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("date");
+ }
+ }
+
+ /**
+ * {@inheritDoc}
+ * A file is in EMBL format if its name contains ".em" or ".dat", or the first line matches
+ * the EMBL format for the ID line.
+ */
+ public boolean canRead(File file) throws IOException {
+ if (readableFileNames.matcher(file.getName()).matches()) return true;
+ BufferedReader br = new BufferedReader(new FileReader(file));
+ String firstLine = br.readLine();
+ boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() &&
+ (lp.matcher(firstLine.substring(3).trim()).matches() ||
+ lpPre87.matcher(firstLine.substring(3).trim()).matches()
+ );
+ br.close();
+ return readable;
+ }
+
+ /**
+ * {@inheritDoc}
+ * Always returns a DNA tokenizer.
+ */
+ public SymbolTokenization guessSymbolTokenization(File file) throws IOException {
+ return RichSequence.IOTools.getDNAParser();
+ }
+
+ /**
+ * {@inheritDoc}
+ * A stream is in EMBL format if its first line matches the EMBL format for the ID line.
+ */
+ public boolean canRead(BufferedInputStream stream) throws IOException {
+ stream.mark(2000); // some streams may not support this
+ BufferedReader br = new BufferedReader(new InputStreamReader(stream));
+ String firstLine = br.readLine();
+ boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() &&
+ (lp.matcher(firstLine.substring(3).trim()).matches() ||
+ lpPre87.matcher(firstLine.substring(3).trim()).matches()
+ );
+ // don't close the reader as it'll close the stream too.
+ // br.close();
+ stream.reset();
+ return readable;
+ }
+
+ /**
+ * {@inheritDoc}
+ * Always returns a DNA tokenizer.
+ */
+ public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException {
+ return RichSequence.IOTools.getDNAParser();
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public boolean readSequence(BufferedReader reader,
+ SymbolTokenization symParser,
+ SeqIOListener listener)
+ throws IllegalSymbolException, IOException, ParseException {
+ if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today");
+ return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null);
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public boolean readRichSequence(BufferedReader reader,
+ SymbolTokenization symParser,
+ RichSeqIOListener rlistener,
+ Namespace ns)
+ throws IllegalSymbolException, IOException, ParseException {
+ tax = null;
+ organism = null;
+ accession = null;
+ boolean hasAnotherSequence = true;
+ //boolean hasInternalWhitespace = false;
+
+ rlistener.startSequence();
+
+ if (ns==null) ns=RichObjectFactory.getDefaultNamespace();
+ rlistener.setNamespace(ns);
+
+ // Get an ordered list of key->value pairs in array-tuples
+ String sectionKey = null;
+ do {
+ List section = this.readSection(reader);
+ sectionKey = ((String[])section.get(0))[0];
+ if(sectionKey == null){
+ String message = ParseException.newMessage(this.getClass(), accession, "not set", "No section key found", sectionToString(section));
+ throw new ParseException(message);
+ }
+ // process section-by-section
+ if (sectionKey.equals(LOCUS_TAG)) {
+ // entryname dataclass; [circular] molecule; division; sequencelength BP.
+ String loc = ((String[])section.get(0))[1];
+ Matcher m = lp.matcher(loc);
+ Matcher mPre87 = lpPre87.matcher(loc);
+ if (m.matches()) {
+ // first token is both name and primary accession
+ rlistener.setName(m.group(1));
+ rlistener.setAccession(m.group(1));
+ // second token is version
+ rlistener.setVersion(Integer.parseInt(m.group(2)));
+ // third token is circular/linear
+ rlistener.setCircular(m.group(3).equals("circular"));
+ // fourth token is moltype
+ rlistener.addSequenceProperty(Terms.getMolTypeTerm(),m.group(4));
+ // fifth token is data class
+ rlistener.addSequenceProperty(Terms.getDataClassTerm(),m.group(5));
+ // sixth token is taxonomic division
+ rlistener.setDivision(m.group(6));
+ // seventh token is the declared sequence length; it is stored as a property,
+ // though the actual length is recalculated from the sequence data later
+ rlistener.addSequenceProperty(Terms.getLengthTerm(),m.group(7));
+ } else if (mPre87.matches()) {
+ rlistener.setName(mPre87.group(1));
+ if (mPre87.group(3)!=null) {
+ // add annotation for 'genomic' (Ensembl-specific term)
+ rlistener.addSequenceProperty(Terms.getGenomicTerm(),null);
+ }
+ rlistener.addSequenceProperty(Terms.getMolTypeTerm(),mPre87.group(4));
+ rlistener.setDivision(mPre87.group(5));
+ rlistener.addSequenceProperty(Terms.getLengthTerm(), mPre87.group(6));
+ // Optional extras
+ String circular = mPre87.group(2);
+ if (circular!=null) rlistener.setCircular(true);
+ } else {
+ String message = ParseException.newMessage(this.getClass(),accession,"Not Set","Bad ID line found", sectionToString(section));
+ throw new ParseException(message);
+ }
+ } else if (sectionKey.equals(DEFINITION_TAG)) {
+ rlistener.setDescription(((String[])section.get(0))[1]);
+ } else if (sectionKey.equals(SOURCE_TAG)) {
+ // only interested in organelle sub-tag
+ for (int i = 1; i < section.size(); i++) {
+ sectionKey = ((String[])section.get(i))[0];
+ if (sectionKey.equals(ORGANELLE_TAG)) {
+ rlistener.addSequenceProperty(Terms.getOrganelleTerm(), ((String[])section.get(i))[1].trim());
+ break; // skip out of for loop once found
+ }
+ if(sectionKey.equals(ORGANISM_TAG)){
+ rlistener.addSequenceProperty(Terms.getOrganismTerm(), ((String[])section.get(i))[1].trim());
+ break;
+ }
+ }
+ } else if (sectionKey.equals(DATE_TAG)) {
+ String chunk = ((String[])section.get(0))[1].trim();
+ Matcher dm = dp.matcher(chunk);
+ Matcher dm_ensembl = dp_ensembl.matcher(chunk);
+ if(dm_ensembl.matches()){
+ String date = dm_ensembl.group(1);
+ rlistener.addSequenceProperty(Terms.getDateTerm(),date);
+ }else if (dm.matches()) {
+ String date = dm.group(1);
+ String rel = dm.group(3);
+ String type = dm.group(4);
+ if (type.equals("Created")) {
+ rlistener.addSequenceProperty(Terms.getDateCreatedTerm(), date);
+ rlistener.addSequenceProperty(Terms.getRelCreatedTerm(), rel);
+ } else if (type.equals("Last updated, Version ")) {
+ rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(), date);
+ rlistener.addSequenceProperty(Terms.getRelUpdatedTerm(), rel);
+ rlistener.addSequenceProperty(Terms.getRelUpdatedRecordVersionTerm(), dm.group(5));
+ } else {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad date type found",sectionToString(section));
+ throw new ParseException(message);
+ }
+ } else {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad date line found",sectionToString(section));
+ throw new ParseException(message);
+
+ }
+ } else if (sectionKey.equals(ACCESSION_TAG)) {
+ // if multiple accessions, store only first as accession,
+ // and store rest in annotation
+ String[] accs = ((String[])section.get(0))[1].split(";");
+ accession = accs[0].trim();
+ rlistener.setAccession(accession);
+ for (int i = 1; i < accs.length; i++) {
+ rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accs[i].trim());
+ }
+ } else if (sectionKey.equals(VERSION_TAG)) {
+ String ver = ((String[])section.get(0))[1];
+ /*Matcher m = vp.matcher(ver);
+ if (m.matches()) {
+ String verAcc = m.group(1);
+ if (!accession.equals(verAcc)) {
+ // the version refers to a different accession!
+ // believe the version line, and store the original
+ // accession away in the additional accession set
+ rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accession);
+ accession = verAcc;
+ rlistener.setAccession(accession);
+ }
+ rlistener.setVersion(Integer.parseInt(m.group(2)));
+ } else {*/
+ rlistener.addSequenceProperty(Terms.getVersionLineTerm(),ver);
+ //}
+ } else if (sectionKey.equals(KEYWORDS_TAG)) {
+ String val = ((String[])section.get(0))[1];
+ val = val.substring(0,val.length()-1); // chomp dot
+ val = val.replace('\n',' '); //remove newline
+ String[] kws = val.split(";");
+ for (int i = 0; i < kws.length; i++) {
+ String kw = kws[i].trim();
+ if (kw.length()==0) continue;
+ rlistener.addSequenceProperty(Terms.getKeywordTerm(), kw);
+ }
+ } else if (sectionKey.equals(DATABASE_XREF_TAG)) {
+ String val = ((String[])section.get(0))[1];
+ val = val.substring(0,val.length()-1); // chomp dot
+ // database_identifier; primary_identifier; secondary_identifier....
+ String[] parts = val.split(";");
+ // construct a DBXREF out of the dbname part[0] and accession part[1]
+ CrossRef crossRef = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{parts[0].trim(),parts[1].trim(), new Integer(0)});
+ // assign remaining bits of info as annotations
+ for (int j = 2; j < parts.length; j++) {
+ Note note = new SimpleNote(Terms.getAdditionalAccessionTerm(),parts[j].trim(),j-1);
+ try {
+ crossRef.getRichAnnotation().addNote(note);
+ } catch (ChangeVetoException ce) {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "Could not annotate identifier terms",sectionToString(section));
+ ParseException pe = new ParseException(message);
+ pe.initCause(ce);
+ throw pe;
+ }
+ }
+ RankedCrossRef rcrossRef = new SimpleRankedCrossRef(crossRef, 0);
+ rlistener.setRankedCrossRef(rcrossRef);
+ } else if (sectionKey.equals(REFERENCE_TAG) && !this.getElideReferences()) {
+ // first line of section has rank and location
+ String refrank = ((String[])section.get(0))[1];
+ int ref_rank = Integer.parseInt(refrank.substring(1,refrank.length()-1));
+ int ref_start = -999;
+ int ref_end = -999;
+ // rest can be in any order
+ String consortium = null;
+ String authors = "";
+ String title = null;
+ String locator = null;
+ String pubmed = null;
+ String medline = null;
+ String doi = null;
+ String remark = null;
+ for (int i = 1; i < section.size(); i++) {
+ String key = ((String[])section.get(i))[0];
+ String val = ((String[])section.get(i))[1];
+ if (key.equals(AUTHORS_TAG)) {
+ if (val.endsWith(";")) val = val.substring(0,val.length()-1); // chomp semicolon
+ authors = val.replace('\n',' '); //see #2276
+ }
+ if (key.equals(CONSORTIUM_TAG)) {
+ if (val.endsWith(";")) val = val.substring(0,val.length()-1); // chomp semicolon
+ consortium = val.replace('\n',' '); //see #2276
+ }
+ if (key.equals(TITLE_TAG)) {
+ if (val.length()>1) {
+ if (val.endsWith(";")) val = val.substring(0,val.length()-1); // chomp semicolon
+ if (val.endsWith("\"")) val = val.substring(1,val.length()-1); // chomp quotes
+ title = val.replace('\n',' '); //see #2276
+ } else title=null; // single semi-colon indicates no title
+ }
+ if (key.equals(LOCATOR_TAG)) {
+ if (val.endsWith(".")) val = val.substring(0,val.length()-1); // chomp dot
+ locator = val.replace('\n',' '); //see #2276
+ }
+ if (key.equals(REFERENCE_XREF_TAG)) {
+ // database_identifier; primary_identifier.
+ String[] refs = val.split("\\.(\\s+|$)");
+ for (int j = 0 ; j < refs.length; j++) {
+ if (refs[j].trim().length()==0) continue;
+ String[] parts = refs[j].split(";");
+ String db = parts[0];
+ String ref = parts[1].trim();
+ if (db.equalsIgnoreCase(Terms.PUBMED_KEY)) pubmed = ref;
+ else if (db.equalsIgnoreCase(Terms.MEDLINE_KEY)) medline = ref;
+ else if (db.equalsIgnoreCase(Terms.DOI_KEY)) doi = ref;
+ }
+ }
+ if (key.equals(REMARK_TAG)) remark = val.replace('\n',' '); //see #2276
+ if (key.equals(REFERENCE_POSITION_TAG)) {
+ // only the first group is taken
+ // if we have multiple lines, only the last line is taken
+ Matcher m = rpp.matcher(val);
+ if (m.matches()) {
+ ref_start = Integer.parseInt(m.group(1));
+ if(m.group(2) != null)
+ ref_end = Integer.parseInt(m.group(3));
+ } else {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad reference line found",sectionToString(section));
+ throw new ParseException(message);
+ }
+ }
+ }
+ // create the docref object
+ try {
+ List authSet = DocRefAuthor.Tools.parseAuthorString(authors);
+ if (consortium!=null) authSet.add(new SimpleDocRefAuthor(consortium, true, false));
+ DocRef dr = (DocRef)RichObjectFactory.getObject(SimpleDocRef.class,new Object[]{authSet,locator,title});
+ // assign either the pubmed or medline to the docref - medline gets priority, then pubmed, then doi
+ if (medline!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.MEDLINE_KEY, medline, new Integer(0)}));
+ else if (pubmed!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.PUBMED_KEY, pubmed, new Integer(0)}));
+ else if (doi!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.DOI_KEY, doi, new Integer(0)}));
+ // assign the remarks
+ if (!this.getElideComments()) dr.setRemark(remark);
+ // assign the docref to the bioentry
+ RankedDocRef rdr = new SimpleRankedDocRef(dr,
+ (ref_start != -999 ? new Integer(ref_start) : null),
+ (ref_end != -999 ? new Integer(ref_end) : null),
+ ref_rank);
+ rlistener.setRankedDocRef(rdr);
+ } catch (ChangeVetoException e) {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section));
+ throw new ParseException(e, message);
+ }
+ } else if (sectionKey.equals(COMMENT_TAG) && !this.getElideComments()) {
+ // Set up some comments
+ rlistener.setComment(((String[])section.get(0))[1]);
+ } else if (sectionKey.equals(FEATURE_TAG) && !this.getElideFeatures()) {
+ // starting from second line of input, start a new feature whenever we come across
+ // a key that does not start with /
+ boolean seenAFeature = false;
+ int rcrossrefCount = 0;
+ for (int i = 1 ; i < section.size(); i++) {
+ String key = ((String[])section.get(i))[0];
+ String val = ((String[])section.get(i))[1];
+ if (key.startsWith("/")) {
+ key = key.substring(1); // strip leading slash
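+ // unlike CustomEMBLFormat, the continuation newlines inserted by readSection
+ // are removed outright here rather than replaced with a space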
+ val = val.replaceAll("\\s*[\\n\\r]+\\s*","").trim();
+ if (val.startsWith("\"")) val = val.substring(1,val.length()-1).trim(); // strip quotes
+ // parameter on old feature
+ if (key.equalsIgnoreCase("db_xref")) {
+ Matcher m = dbxp.matcher(val);
+ if (m.matches()) {
+ String dbname = m.group(1);
+ String raccession = m.group(2);
+ if (dbname.equalsIgnoreCase("taxon")) {
+ // Set the Taxon instead of a dbxref
+ tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{Integer.valueOf(raccession)});
+ rlistener.setTaxon(tax);
+ try {
+ if (organism!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism);
+ } catch (ChangeVetoException e) {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section));
+ throw new ParseException(e, message);
+ }
+ } else {
+ try {
+ CrossRef cr = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{dbname, raccession, new Integer(0)});
+ RankedCrossRef rcr = new SimpleRankedCrossRef(cr, ++rcrossrefCount);
+ rlistener.getCurrentFeature().addRankedCrossRef(rcr);
+ } catch (ChangeVetoException e) {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section));
+ throw new ParseException(e, message);
+ }
+ }
+ } else {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad dbxref found",sectionToString(section));
+ throw new ParseException(message);
+ }
+ } else if (key.equalsIgnoreCase("organism")) {
+ try {
+ organism = val;
+ if (tax!=null) tax.addName(NCBITaxon.SCIENTIFIC,organism);
+ } catch (ChangeVetoException e) {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "",sectionToString(section));
+ throw new ParseException(e, message);
+ }
+ } else {
+ if (key.equalsIgnoreCase("translation")) {
+ // strip spaces from sequence
+ val = val.replaceAll("\\s+","");
+ }
+ rlistener.addFeatureProperty(RichObjectFactory.getDefaultOntology().getOrCreateTerm(key),val);
+ }
+ } else {
+ // new feature!
+ // end previous feature
+ if (seenAFeature) rlistener.endFeature();
+ // start next one, with lots of lovely info in it
+ RichFeature.Template templ = new RichFeature.Template();
+ templ.annotation = new SimpleRichAnnotation();
+ templ.sourceTerm = Terms.getEMBLTerm();
+ templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(key);
+ templ.featureRelationshipSet = new TreeSet();
+ templ.rankedCrossRefs = new TreeSet();
+ String tidyLocStr = val.replaceAll("\\s+","");
+ templ.location = GenbankLocationParser.parseLocation(ns, accession, tidyLocStr);
+ rlistener.startFeature(templ);
+ seenAFeature = true;
+ rcrossrefCount = 0;
+ }
+ }
+ if (seenAFeature) rlistener.endFeature();
+ } /*else if (sectionKey.equals(START_SEQUENCE_TAG) && !this.getElideSymbols()) {
+ StringBuffer seq = new StringBuffer();
+ for (int i = 0 ; i < section.size(); i++) seq.append(((String[])section.get(i))[1]);
+ try {
+ SymbolList sl = new SimpleSymbolList(symParser,
+ seq.toString().replaceAll("\\s+","").replaceAll("[\\.|~]","-"));
+ rlistener.addSymbols(symParser.getAlphabet(),
+ (Symbol[])(sl.toList().toArray(new Symbol[0])),
+ 0, sl.length());
+ } catch (Exception e) {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad sequence",sectionToString(section));
+ throw new ParseException(e, message);
+ }
+ }*/
+ } while (!sectionKey.equals(END_SEQUENCE_TAG));
+
+ // Allows us to tolerate trailing whitespace without
+ // thinking that there is another Sequence to follow
+ while (true) {
+ reader.mark(1);
+ int c = reader.read();
+ if (c == -1) {
+ hasAnotherSequence = false;
+ break;
+ }
+ if (Character.isWhitespace((char) c)) {
+ //hasInternalWhitespace = true;
+ continue;
+ }
+ //if (hasInternalWhitespace)
+ // System.err.println("Warning: whitespace found between sequence entries");
+ reader.reset();
+ break;
+ }
+
+ // Finish up.
+ rlistener.endSequence();
+ return hasAnotherSequence;
+ }
+
+ // reads an indented section, combining split lines and creating a list of key->value tuples
+ private List readSection(BufferedReader br) throws ParseException {
+ List section = new ArrayList();
+ String line;
+ boolean done = false;
+
+ // while not done
+ try {
+ while (!done) {
+ // mark buffer
+ br.mark(160);
+ // read token
+ line = br.readLine();
+ if (line.length()<2) {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "Bad line found",line);
+ throw new ParseException(message);
+ }
+ String token = line.substring(0,2);
+ // READ SEQUENCE SECTION
+ if (token.equals(START_SEQUENCE_TAG)) {
+ // from next line, read sequence until // - leave // on stack
+ StringBuffer sb = new StringBuffer();
+ while (!done) {
+ br.mark(160);
+ line = br.readLine();
+ if (line.startsWith(END_SEQUENCE_TAG)) {
+ br.reset();
+ done = true;
+ } else {
+ // create sequence tag->value pair to return, sans numbers
+ sb.append(line.replaceAll("\\d",""));
+ }
+ }
+ section.add(new String[]{START_SEQUENCE_TAG,sb.toString()});
+ }
+ // READ FEATURE TABLE SECTION
+ else if (token.equals(FEATURE_HEADER_TAG)) {
+ // create dummy feature tag->value pair and add to return set
+ section.add(new String[]{FEATURE_TAG,null});
+ // drop next FH line
+ line = br.readLine(); // skip next line too - it is also FH
+ // read all FT lines until XX
+ String currentTag = null;
+ StringBuffer currentVal = null;
+ while (!done) {
+ line = br.readLine();
+ if (line.startsWith(DELIMITER_TAG)) {
+ done = true;
+ // dump current tag if exists
+ if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
+ } else {
+ // FT lines: FT word value
+ // or FT /word
+ // or FT /db_xref="taxon:3899....
+ // ......"
+ line = line.substring(5); // chomp off "FT "
+ if (!line.startsWith(" ")) {
+ // dump current tag if exists
+ if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
+ // case 1 : word value - splits into key-value on its own
+ String[] parts = line.trim().split("\\s+");
+ currentTag = parts[0];
+ currentVal = new StringBuffer();
+ currentVal.append(parts[1]);
+ } else {
+ line = line.trim();
+ if (line.startsWith("/")) {
+ // dump current tag if exists
+ if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
+ // case 2 : /word[=.....]
+ currentVal = new StringBuffer();
+ int equalIndex = line.indexOf('=');
+ if (equalIndex>=0) {
+ currentTag = line.substring(0, equalIndex);
+ currentVal.append(line.substring(equalIndex+1));
+ } else {
+ currentTag = line;
+ }
+ } else {
+ // case 3 : ...."
+ currentVal.append("\n");
+ currentVal.append(line);
+ }
+ }
+ }
+ }
+ }
+ // READ END OF SEQUENCE
+ else if (token.equals(END_SEQUENCE_TAG)) {
+ section.add(new String[]{END_SEQUENCE_TAG,null});
+ done = true;
+ }
+ // READ DELIMITER TAG
+ else if (token.equals(DELIMITER_TAG)) {
+ section.add(new String[]{DELIMITER_TAG,null});
+ done = true;
+ }
+ // READ THIRD PARTY ANNOTATION SECTION
+ else if (token.equals(TPA_TAG)) {
+ // exception = don't know how to do TPA yet
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle TPAs just yet",sectionToString(section));
+ throw new ParseException(message);
+ }
+ // READ CONTIG SECTION
+ //else if (token.equals(CONTIG_TAG)) {
+ // exception = don't know how to do contigs yet
+ //String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle contig assemblies just yet",sectionToString(section));
+ //throw new ParseException(message);
+ //}
+ //2016.1.27 modified by Xiujuan to support parsing files that contain CONTIG_TAG
+ else if (token.equals(CONTIG_TAG)) {
+ section.add(new String[]{CONTIG_TAG,null});
+ done = true;
+ }
+ // READ DOCREF
+ else if (token.equals(DATABASE_XREF_TAG)) {
+ section.add(new String[]{DATABASE_XREF_TAG,line.substring(5).trim()});
+ done = true;
+ }
+ // READ DATE
+ else if (token.equals(DATE_TAG)) {
+ section.add(new String[]{DATE_TAG,line.substring(5).trim()});
+ done = true;
+ }
+ // READ NORMAL TAG/VALUE SECTION
+ else {
+ // rewind buffer to mark
+ br.reset();
+ // read token/values until XX
+ String currentTag = null;
+ StringBuffer currentVal = null;
+ while (!done) {
+ line = br.readLine();
+ if (line.startsWith(DELIMITER_TAG)) {
+ done = true;
+ // dump current tag if exists
+ if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
+ } else {
+ try {
+ // merge neighbouring repeated tokens by concatting values
+ // return tag->value pairs
+ String tag = line.substring(0,2);
+ String value = line.substring(5);
+ if (currentTag==null || !tag.equals(currentTag)) {
+ // dump current tag if exists
+ if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
+ // start new tag
+ currentTag = tag;
+ currentVal = new StringBuffer();
+ currentVal.append(value);
+ } else {
+ currentVal.append("\n");
+ currentVal.append(value);
+ }
+ } catch (Exception e) {
+ String message = ParseException.newMessage(this.getClass(), accession, "not set","",sectionToString(section));
+ throw new ParseException(e, message);
+ }
+ }
+ }
+ }
+ }
+ } catch (IOException e) {
+ String message = ParseException.newMessage(this.getClass(),accession,"not set", "Unable to handle TPAs just yet",sectionToString(section));
+ throw new ParseException(message);
+ }
+ return section;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public void writeSequence(Sequence seq, PrintStream os) throws IOException {
+ if (this.getPrintStream()==null) this.setPrintStream(os);
+ this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException {
+ if (this.getPrintStream()==null) this.setPrintStream(os);
+ this.writeSequence(seq, format, RichObjectFactory.getDefaultNamespace());
+ }
+
+ /**
+ * {@inheritDoc}
+ * Namespace is ignored as EMBL has no concept of it.
+ */
+ public void writeSequence(Sequence seq, Namespace ns) throws IOException {
+ this.writeSequence(seq, this.getDefaultFormat(), ns);
+ }
+
+ /**
+ * As per {@link #writeSequence(Sequence, Namespace)}, except
+ * that it also takes a format parameter. This can be any of the formats
+ * defined as constants in this class.
+ * @param seq see {@link #writeSequence(Sequence, Namespace)}
+ * @param format the format to use.
+ * @param ns see {@link #writeSequence(Sequence, Namespace)}
+ * @throws IOException see {@link #writeSequence(Sequence, Namespace)}
+ */
+ public void writeSequence(Sequence seq, String format, Namespace ns) throws IOException {
+ if (!format.equals(EMBL_FORMAT) && !format.equals(EMBL_PRE87_FORMAT))
+ throw new IllegalArgumentException("Format "+format+" not recognised.");
+
+ RichSequence rs;
+ try {
+ if (seq instanceof RichSequence) rs = (RichSequence)seq;
+ else rs = RichSequence.Tools.enrich(seq);
+ } catch (ChangeVetoException e) {
+ IOException e2 = new IOException("Unable to enrich sequence");
+ e2.initCause(e);
+ throw e2;
+ }
+
+ SymbolTokenization tok;
+ try {
+ tok = rs.getAlphabet().getTokenization("token");
+ } catch (Exception e) {
+ throw new RuntimeException("Unable to get alphabet tokenizer",e);
+ }
+
+ Set notes = rs.getNoteSet();
+ String accession = rs.getAccession();
+ StringBuffer accessions = new StringBuffer();
+ accessions.append(accession);
+ accessions.append(";");
+ String cdat = null;
+ String udat = null;
+ String crel = null;
+ String urel = null;
+ String urecv = null;
+ String organelle = null;
+ String versionLine = null;
+ String dataClass = "STD";
+ boolean genomic = false;
+ String moltype = rs.getAlphabet().getName();
+ for (Iterator i = notes.iterator(); i.hasNext(); ) {
+ Note n = (Note)i.next();
+ if (n.getTerm().equals(Terms.getDateCreatedTerm())) cdat=n.getValue();
+ else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue();
+ else if (n.getTerm().equals(Terms.getRelCreatedTerm())) crel=n.getValue();
+ else if (n.getTerm().equals(Terms.getRelUpdatedTerm())) urel=n.getValue();
+ else if (n.getTerm().equals(Terms.getRelUpdatedRecordVersionTerm())) urecv=n.getValue();
+ else if (n.getTerm().equals(Terms.getMolTypeTerm())) moltype=n.getValue();
+ else if (n.getTerm().equals(Terms.getVersionLineTerm())) versionLine=n.getValue();
+ else if (n.getTerm().equals(Terms.getGenomicTerm())) genomic = true;
+ else if (n.getTerm().equals(Terms.getDataClassTerm())) dataClass = n.getValue();
+ else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) {
+ accessions.append(" ");
+ accessions.append(n.getValue());
+ accessions.append(";");
+ } else if (n.getTerm().equals(Terms.getOrganelleTerm())) organelle=n.getValue();
+ }
+
+ StringBuffer locusLine = new StringBuffer();
+ // Division cannot be null
+ String div = rs.getDivision();
+ if(div==null || div.length()==0 || div.length()>3)
+ div = "UNC"; //Unclassified
+
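+ // Build the ID line in the requested dialect: release-87+ "SV" layout, or the pre-87 layout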
+ if (format.equals(EMBL_FORMAT)) {
+ // accession; SV version; circular/linear; moltype; dataclass; division; length BP.
+ locusLine.append(rs.getAccession());
+ locusLine.append("; SV ");
+ locusLine.append(rs.getVersion());
+ locusLine.append("; ");
+ locusLine.append(rs.getCircular()?"circular":"linear");
+ locusLine.append("; ");
+ locusLine.append(moltype);
+ locusLine.append("; ");
+ locusLine.append(dataClass);
+ locusLine.append("; ");
+ locusLine.append(div);
+ locusLine.append("; ");
+ locusLine.append(rs.length());
+ locusLine.append(" BP.");
+ } else if (format.equals(EMBL_PRE87_FORMAT)) {
+ // entryname dataclass; [circular] molecule; division; sequencelength BP.
+ locusLine.append(StringTools.rightPad(rs.getName(), 9));
+ locusLine.append(" standard; ");
+ locusLine.append(rs.getCircular()?"circular ":"");
+ // if it is Ensembl genomic, add that in too
+ if (genomic) locusLine.append("genomic ");
+ locusLine.append(moltype);
+ locusLine.append("; ");
+ locusLine.append(div);
+ locusLine.append("; ");
+ locusLine.append(rs.length());
+ locusLine.append(" BP.");
+ }
+ StringTools.writeKeyValueLine(LOCUS_TAG, locusLine.toString(), 5, this.getLineWidth(), null, LOCUS_TAG, this.getPrintStream());
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+
+ // accession line
+ StringTools.writeKeyValueLine(ACCESSION_TAG, accessions.toString(), 5, this.getLineWidth(), null, ACCESSION_TAG, this.getPrintStream());
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+
+ // version line
+ if (format.equals(EMBL_PRE87_FORMAT)) {
+ if (versionLine!=null) StringTools.writeKeyValueLine(VERSION_TAG, versionLine, 5, this.getLineWidth(), null, VERSION_TAG, this.getPrintStream());
+ else StringTools.writeKeyValueLine(VERSION_TAG, accession+"."+rs.getVersion(), 5, this.getLineWidth(), null, VERSION_TAG, this.getPrintStream());
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+ }
+
+ // date line
+ StringTools.writeKeyValueLine(DATE_TAG, (cdat==null?udat:cdat)+" (Rel. "+(crel==null?"0":crel)+", Created)", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream());
+ StringTools.writeKeyValueLine(DATE_TAG, udat+" (Rel. "+(urel==null?"0":urel)+", Last updated, Version "+(urecv==null?"0":urecv)+")", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream());
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+
+ // definition line
+ StringTools.writeKeyValueLine(DEFINITION_TAG, rs.getDescription(), 5, this.getLineWidth(), null, DEFINITION_TAG, this.getPrintStream());
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+
+ // keywords line
+ StringBuffer keywords = new StringBuffer();
+ for (Iterator n = notes.iterator(); n.hasNext(); ) {
+ Note nt = (Note)n.next();
+ if (nt.getTerm().equals(Terms.getKeywordTerm())) {
+ if (keywords.length()>0) keywords.append("; ");
+ keywords.append(nt.getValue());
+ }
+ }
+ if (keywords.length()>0) {
+ keywords.append(".");
+ StringTools.writeKeyValueLine(KEYWORDS_TAG, keywords.toString(), 5, this.getLineWidth(), null, KEYWORDS_TAG, this.getPrintStream());
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+ } else {
+ this.getPrintStream().println(KEYWORDS_TAG+" .");
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+ }
+
+ // source line (from taxon)
+ // organism line
+ NCBITaxon tax = rs.getTaxon();
+ if (tax!=null) {
+ StringTools.writeKeyValueLine(SOURCE_TAG, tax.getDisplayName(), 5, this.getLineWidth(), null, SOURCE_TAG, this.getPrintStream());
+ StringTools.writeKeyValueLine(ORGANISM_TAG, tax.getNameHierarchy(), 5, this.getLineWidth(), null, ORGANISM_TAG, this.getPrintStream());
+ if (organelle!=null) StringTools.writeKeyValueLine(ORGANELLE_TAG, organelle, 5, this.getLineWidth(), null, ORGANELLE_TAG, this.getPrintStream());
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+ }
+
+ // references - rank (bases x to y)
+ for (Iterator r = rs.getRankedDocRefs().iterator(); r.hasNext(); ) {
+ RankedDocRef rdr = (RankedDocRef)r.next();
+ DocRef d = rdr.getDocumentReference();
+ // RN, RC, RP, RX, RG, RA, RT, RL
+ StringTools.writeKeyValueLine(REFERENCE_TAG, "["+rdr.getRank()+"]", 5, this.getLineWidth(), null, REFERENCE_TAG, this.getPrintStream());
+ StringTools.writeKeyValueLine(REMARK_TAG, d.getRemark(), 5, this.getLineWidth(), null, REMARK_TAG, this.getPrintStream());
+ Integer rstart = rdr.getStart();
+ if (rstart==null) rstart = new Integer(1);
+ Integer rend = rdr.getEnd();
+ if (rend==null) rend = new Integer(rs.length());
+ StringTools.writeKeyValueLine(REFERENCE_POSITION_TAG, rstart+"-"+rend, 5, this.getLineWidth(), null, REFERENCE_POSITION_TAG, this.getPrintStream());
+ CrossRef c = d.getCrossref();
+ if (c!=null) StringTools.writeKeyValueLine(REFERENCE_XREF_TAG, c.getDbname()+"; "+c.getAccession()+".", 5, this.getLineWidth(), null, REFERENCE_XREF_TAG, this.getPrintStream());
+ List auths = d.getAuthorList();
+ for (Iterator j = auths.iterator(); j.hasNext(); ) {
+ DocRefAuthor a = (DocRefAuthor)j.next();
+ if (a.isConsortium()) {
+ StringTools.writeKeyValueLine(CONSORTIUM_TAG, a+";", 5, this.getLineWidth(), null, CONSORTIUM_TAG, this.getPrintStream());
+ j.remove();
+ }
+ }
+ if (!auths.isEmpty()) StringTools.writeKeyValueLine(AUTHORS_TAG, DocRefAuthor.Tools.generateAuthorString(auths, true)+";", 5, this.getLineWidth(), null, AUTHORS_TAG, this.getPrintStream());
+ else StringTools.writeKeyValueLine(AUTHORS_TAG, ";", 5, this.getLineWidth(), null, AUTHORS_TAG, this.getPrintStream());
+ if (d.getTitle()!=null && d.getTitle().length()!=0) StringTools.writeKeyValueLine(TITLE_TAG, "\""+d.getTitle()+"\";", 5, this.getLineWidth(), null, TITLE_TAG, this.getPrintStream());
+ else StringTools.writeKeyValueLine(TITLE_TAG, ";", 5, this.getLineWidth(), null, TITLE_TAG, this.getPrintStream());
+ StringTools.writeKeyValueLine(LOCATOR_TAG, d.getLocation()+".", 5, this.getLineWidth(), null, LOCATOR_TAG, this.getPrintStream());
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+ }
+
+ // db references - ranked
+ for (Iterator r = rs.getRankedCrossRefs().iterator(); r.hasNext(); ) {
+ RankedCrossRef rcr = (RankedCrossRef)r.next();
+ CrossRef c = rcr.getCrossRef();
+ Set noteset = c.getNoteSet();
+ StringBuffer sb = new StringBuffer();
+ sb.append(c.getDbname());
+ sb.append("; ");
+ sb.append(c.getAccession());
+ boolean hasSecondary = false;
+ for (Iterator i = noteset.iterator(); i.hasNext(); ) {
+ Note n = (Note)i.next();
+ if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) {
+ sb.append("; ");
+ sb.append(n.getValue());
+ hasSecondary = true;
+ }
+ }
+ //if (!hasSecondary) sb.append("; -");
+ //sb.append(".");
+ if (!hasSecondary) sb.append(";");
+ else sb.append(".");
+ StringTools.writeKeyValueLine(DATABASE_XREF_TAG, sb.toString(), 5, this.getLineWidth(), null, DATABASE_XREF_TAG, this.getPrintStream());
+ }
+ if (!rs.getRankedCrossRefs().isEmpty())
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+
+ // comments - if any
+ if (!rs.getComments().isEmpty()) {
+ StringBuffer sb = new StringBuffer();
+ for (Iterator i = rs.getComments().iterator(); i.hasNext(); ) {
+ Comment c = (Comment)i.next();
+ sb.append(c.getComment());
+ if (i.hasNext()) sb.append("\n");
+ }
+ StringTools.writeKeyValueLine(COMMENT_TAG, sb.toString(), 5, this.getLineWidth(), null, COMMENT_TAG, this.getPrintStream());
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+ }
+
+ this.getPrintStream().println(FEATURE_HEADER_TAG+" Key Location/Qualifiers");
+ this.getPrintStream().println(FEATURE_HEADER_TAG+" ");
+ // feature_type location
+ for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) {
+ RichFeature f = (RichFeature)i.next();
+ StringTools.writeKeyValueLine(FEATURE_TAG+" "+f.getTypeTerm().getName(), GenbankLocationParser.writeLocation((RichLocation)f.getLocation()), 21, this.getLineWidth(), ",", FEATURE_TAG, this.getPrintStream());
+ for (Iterator j = f.getNoteSet().iterator(); j.hasNext(); ) {
+ Note n = (Note)j.next();
+ // /key="val" or just /key if val==""
+ if (n.getValue()==null || n.getValue().length()==0) StringTools.writeKeyValueLine(FEATURE_TAG, "/"+n.getTerm().getName(), 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
+ else StringTools.writeKeyValueLine(FEATURE_TAG, "/"+n.getTerm().getName()+"=\""+n.getValue()+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
+ }
+ // add-in to source feature only organism and db_xref="taxon:xyz" where present
+ if (f.getType().equals("source") && tax!=null) {
+ String displayName = tax.getDisplayName();
+ if (displayName.indexOf('(')>-1) displayName = displayName.substring(0, displayName.indexOf('(')).trim();
+ StringTools.writeKeyValueLine(FEATURE_TAG, "/organism=\""+displayName+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
+ StringTools.writeKeyValueLine(FEATURE_TAG, "/db_xref=\"taxon:"+tax.getNCBITaxID()+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
+ }
+ // add-in other dbxrefs where present
+ for (Iterator j = f.getRankedCrossRefs().iterator(); j.hasNext(); ) {
+ RankedCrossRef rcr = (RankedCrossRef)j.next();
+ CrossRef cr = rcr.getCrossRef();
+ StringTools.writeKeyValueLine(FEATURE_TAG, "/db_xref=\""+cr.getDbname()+":"+cr.getAccession()+"\"", 21, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
+ }
+ }
+ this.getPrintStream().println(DELIMITER_TAG+" ");
+
+ // SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
+ int aCount = 0;
+ int cCount = 0;
+ int gCount = 0;
+ int tCount = 0;
+ int oCount = 0;
+ for (int i = 1; i <= rs.length(); i++) {
+ char c;
+ try {
+ c = tok.tokenizeSymbol(rs.symbolAt(i)).charAt(0);
+ } catch (Exception e) {
+ throw new RuntimeException("Unable to get symbol at position "+i,e);
+ }
+ switch (c) {
+ case 'a': case 'A':
+ aCount++;
+ break;
+ case 'c': case 'C':
+ cCount++;
+ break;
+ case 'g': case 'G':
+ gCount++;
+ break;
+ case 't': case 'T':
+ tCount++;
+ break;
+ default:
+ oCount++;
+ }
+ }
+ this.getPrintStream().print(START_SEQUENCE_TAG+" Sequence "+rs.length()+" BP; ");
+ this.getPrintStream().print(aCount + " A; ");
+ this.getPrintStream().print(cCount + " C; ");
+ this.getPrintStream().print(gCount + " G; ");
+ this.getPrintStream().print(tCount + " T; ");
+ this.getPrintStream().println(oCount + " other;");
+
+ // sequence data: 60 symbols per line in blocks of 10, with a right-aligned running total
+ Symbol[] syms = (Symbol[])rs.toList().toArray(new Symbol[0]);
+ int lineLen = 0;
+ int symCount = 0;
+ this.getPrintStream().print(" ");
+ for (int i = 0; i < syms.length; i++) {
+ if (symCount % 60 == 0 && symCount>0) {
+ this.getPrintStream().print(StringTools.leftPad(""+symCount,10));
+ this.getPrintStream().print("\n ");
+ lineLen = 0;
+ }
+ if (symCount % 10 == 0) {
+ this.getPrintStream().print(" ");
+ lineLen++;
+ }
+ try {
+ this.getPrintStream().print(tok.tokenizeSymbol(syms[i]));
+ } catch (IllegalSymbolException e) {
+ throw new RuntimeException("Found illegal symbol: "+syms[i]);
+ }
+ symCount++;
+ lineLen++;
+ }
+ this.getPrintStream().print(StringTools.leftPad(""+symCount,(66-lineLen)+10));
+ this.getPrintStream().print("\n");
+ this.getPrintStream().println(END_SEQUENCE_TAG);
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public String getDefaultFormat() {
+ return EMBL_FORMAT;
+ }
+
+
+ /**
+ * Converts the current parse section to a String. Useful for debugging.
+ */
+ String sectionToString(List section){
+ StringBuffer parseBlock = new StringBuffer();
+ for(Iterator i = section.listIterator(); i.hasNext();){
+ String[] part = (String[])i.next();
+ for(int x = 0; x < part.length; x++){
+ parseBlock.append(part[x]);
+ if(x == 0){
+ parseBlock.append(" "); //the gap will have been trimmed
+ }
+ }
+ }
+ return parseBlock.toString();
+ }
+}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomIOTools.java b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomIOTools.java
index afed93a..0e1c65b 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomIOTools.java
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomIOTools.java
@@ -1,6 +1,5 @@
package cn.piflow.bundle.microorganism.util;
-
import org.biojava.bio.BioError;
import org.biojava.bio.BioException;
import org.biojava.bio.seq.*;
@@ -662,10 +661,19 @@ public interface CustomIOTools {
* @return a RichSequenceIterator over each
* sequence in the EMBL file
*/
+ public static RichSequenceIterator readEMBLDNA(BufferedReader br,
+ Namespace ns) {
+ return new RichStreamReader(br, new CustomEMBLFormat(), getDNAParser(),
+ factory, ns);
+ }
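+ // Usage sketch (hypothetical file name, not part of this change):
+ //   BufferedReader br = new BufferedReader(new FileReader("seqs.embl"));
+ //   RichSequenceIterator it = CustomIOTools.readEMBLDNA(br, RichObjectFactory.getDefaultNamespace());
+ //   while (it.hasNext()) { RichSequence rs = it.nextRichSequence(); /* process */ }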
-
-
+ // Parse an Ensembl flat file into RichSequences
+ public static RichSequenceIterator readEnsembl(BufferedReader br,
+ Namespace ns) {
+ return new RichStreamReader(br, new CustomEnsemblFormat(), getDNAParser(),
+ factory, ns);
+ }
/**
* Iterate over the sequences in an EMBL-format stream of RNA sequences.
@@ -753,7 +761,11 @@ public interface CustomIOTools {
* @return a RichSequenceIterator over each
* sequence in the UniProt file
*/
-
+ public static RichSequenceIterator readUniProt(BufferedReader br,
+ Namespace ns) {
+ return new RichStreamReader(br, new CustomUniProtFormat(),
+ getProteinParser(), factory, ns);
+ }
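+ // Usage sketch (hypothetical path, mirrors readEMBLDNA above):
+ //   RichSequenceIterator it = CustomIOTools.readUniProt(
+ //       new BufferedReader(new FileReader("entries.dat")), RichObjectFactory.getDefaultNamespace());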
/**
* Read a UniProt XML file using a custom type of SymbolList. For
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomUniProtFormat.java b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomUniProtFormat.java
new file mode 100644
index 0000000..5478a5e
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/CustomUniProtFormat.java
@@ -0,0 +1,1291 @@
+package cn.piflow.bundle.microorganism.util;
+
+import org.biojava.bio.proteomics.MassCalc;
+import org.biojava.bio.seq.Sequence;
+import org.biojava.bio.seq.io.ParseException;
+import org.biojava.bio.seq.io.SeqIOListener;
+import org.biojava.bio.seq.io.SymbolTokenization;
+import org.biojava.bio.symbol.*;
+import org.biojava.ontology.Term;
+import org.biojava.utils.ChangeVetoException;
+import org.biojavax.*;
+import org.biojavax.bio.seq.RichFeature;
+import org.biojavax.bio.seq.RichLocation;
+import org.biojavax.bio.seq.RichSequence;
+import org.biojavax.bio.seq.io.RichSeqIOListener;
+import org.biojavax.bio.seq.io.RichSequenceFormat;
+import org.biojavax.bio.seq.io.UniProtCommentParser;
+import org.biojavax.bio.seq.io.UniProtLocationParser;
+import org.biojavax.bio.taxa.NCBITaxon;
+import org.biojavax.bio.taxa.SimpleNCBITaxon;
+import org.biojavax.ontology.ComparableTerm;
+import org.biojavax.utils.CRC64Checksum;
+import org.biojavax.utils.StringTools;
+
+import java.io.*;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Created by xiujuan on 2016/5/11.
+ */
+public class CustomUniProtFormat extends RichSequenceFormat.HeaderlessFormat{
+
+
+ // Register this format with the format auto-guesser.
+ static {
+ RichSequence.IOTools.registerFormat(CustomUniProtFormat.class);
+ }
+
+ /**
+ * The name of this format
+ */
+ public static final String UNIPROT_FORMAT = "UniProt";
+
+ private static final String SUBFORMAT_UNIPROT = "UniProt";
+ private static final String SUBFORMAT_IPI = "IPI";
+
+ protected static final String LOCUS_TAG = "ID";
+ protected static final String ACCESSION_TAG = "AC";
+ protected static final String DEFINITION_TAG = "DE";
+ protected static final String DATE_TAG = "DT";
+ protected static final String SOURCE_TAG = "OS";
+ protected static final String ORGANELLE_TAG = "OG";
+ protected static final String ORGANISM_TAG = "OC";
+ protected static final String TAXON_TAG = "OX";
+ protected static final String GENE_TAG = "GN";
+ protected static final String DATABASE_XREF_TAG = "DR";
+ protected static final String PROTEIN_EXIST_TAG = "PE";
+ protected static final String REFERENCE_TAG = "RN";
+ protected static final String RP_LINE_TAG = "RP";
+ protected static final String REFERENCE_XREF_TAG = "RX";
+ protected static final String AUTHORS_TAG = "RA";
+ protected static final String CONSORTIUM_TAG = "RG";
+ protected static final String TITLE_TAG = "RT";
+ protected static final String LOCATION_TAG = "RL";
+ protected static final String RC_LINE_TAG = "RC";
+ protected static final String KEYWORDS_TAG = "KW";
+ protected static final String COMMENT_TAG = "CC";
+ protected static final String FEATURE_TAG = "FT";
+ protected static final String START_SEQUENCE_TAG = "SQ";
+ protected static final String END_SEQUENCE_TAG = "//";
+ protected static final String ORGANISM_HOST_TAG = "OH";
+
+ // locus line for uniprot format
+ protected static final Pattern lp_uniprot = Pattern.compile("^((\\S+)_(\\S+))\\s+(\\S+);\\s+(PRT)?;?\\s*\\d+\\s+AA\\.$");
+ // locus line for IPI format
+ protected static final Pattern lp_ipi = Pattern.compile("^((\\S+)\\.(\\d+))\\s+(IPI);\\s+(PRT)?;?\\s*\\d+\\s+AA\\.$");
+ // RP line parser
+ protected static final Pattern rppat = Pattern.compile("SEQUENCE OF (\\d+)-(\\d+)");
+ // DT (date) lines for UniProt:
+ // date, integrated into UniProtKB/database_name.
+ // date, sequence version x.
+ // date, entry version x.
+ protected static final Pattern dp_uniprot = Pattern.compile("([^,]+),([^\\d\\.]+)(\\d+)?\\.$");
+ // DT (date) lines for IPI:
+ // date (xxx, Created)
+ // date (xxx, Last sequence update)
+ protected static final Pattern dp_ipi = Pattern.compile("([^\\(]+)\\(([^,]+),([^\\)]+)\\)$");
+ // feature line: "start end [description]" (positions may use <, > or ?)
+ protected static final Pattern fp = Pattern.compile("^\\s*([\\d?<]+\\s+[\\d?>]+)(\\s+(.*))?$");
+
+ protected static final Pattern headerLine = Pattern.compile("^ID.*");
+
+ /**
+ * Implements some UniProt-specific terms.
+ */
+ public static class Terms extends RichSequence.Terms {
+ private static String GENENAME_KEY = "Name";
+ private static String GENESYNONYM_KEY = "Synonyms";
+ private static String ORDLOCNAME_KEY = "OrderedLocusNames";
+ private static String ORFNAME_KEY = "ORFNames";
+
+ /**
+ * Getter for the UniProt term
+ * @return The UniProt Term
+ */
+ public static ComparableTerm getUniProtTerm() {
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("UniProt");
+ }
+
+ /**
+ * Getter for the UniProt combined database term
+ * @return The combined database for UniProt Term
+ */
+ public static ComparableTerm getUniProtDBNameTerm() {
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("UniProt database name");
+ }
+
+ /**
+ * Getter for the protein exists term
+ * @return The protein exists Term
+ */
+ public static ComparableTerm getProteinExistsTerm() {
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("UniProt protein exists");
+ }
+
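+ /**
+ * Getter for the organism host term
+ * @return The organism host Term
+ */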
+ public static ComparableTerm getOrganismHostTerm(){
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("Organism host");
+ }
+
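+ /**
+ * Getter for the sequence meta info term (contents of the SQ header line)
+ * @return The sequence meta info Term
+ */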
+ public static ComparableTerm getSequenceMetaInfoTerm(){
+ return RichObjectFactory.getDefaultOntology().getOrCreateTerm("Sequence meta info");
+ }
+ }
+
+ /**
+ * {@inheritDoc}
+ * A file is in UniProt format if the first line matches the UniProt format for the ID line.
+ */
+ public boolean canRead(File file) throws IOException {
+ BufferedReader br = new BufferedReader(new FileReader(file));
+ String firstLine = br.readLine();
+ boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() &&
+ (lp_uniprot.matcher(firstLine.substring(3).trim()).matches() ||
+ lp_ipi.matcher(firstLine.substring(3).trim()).matches());
+ br.close();
+ return readable;
+ }
+
+ /**
+ * {@inheritDoc}
+ * Always returns a protein tokenizer.
+ */
+ public SymbolTokenization guessSymbolTokenization(File file) throws IOException {
+ return RichSequence.IOTools.getProteinParser();
+ }
+
+ /**
+ * {@inheritDoc}
+ * A stream is in UniProt format if the first line matches the UniProt format for the ID line.
+ */
+ public boolean canRead(BufferedInputStream stream) throws IOException {
+ stream.mark(2000); // some streams may not support this
+ BufferedReader br = new BufferedReader(new InputStreamReader(stream));
+ String firstLine = br.readLine();
+ boolean readable = firstLine!=null && headerLine.matcher(firstLine).matches() &&
+ (lp_uniprot.matcher(firstLine.substring(3).trim()).matches()
+ || lp_ipi.matcher(firstLine.substring(3).trim()).matches());
+ // don't close the reader as it'll close the stream too.
+ // br.close();
+ stream.reset();
+ return readable;
+ }
+
+ /**
+ * {@inheritDoc}
+ * Always returns a protein tokenizer.
+ */
+ public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException {
+ return RichSequence.IOTools.getProteinParser();
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public boolean readSequence(BufferedReader reader,
+ SymbolTokenization symParser,
+ SeqIOListener listener)
+ throws IllegalSymbolException, IOException, ParseException {
+ if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today");
+ return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null);
+ }
+
+ private String accession = null;
+
+ /**
+ * {@inheritDoc}
+ */
+ public boolean readRichSequence(BufferedReader reader,
+ SymbolTokenization symParser,
+ RichSeqIOListener rlistener,
+ Namespace ns)
+ throws IllegalSymbolException, IOException, ParseException {
+
+ boolean hasAnotherSequence = true;
+ //boolean hasInternalWhitespace = false;
+
+ String subformat = SUBFORMAT_UNIPROT;
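+ // Assume plain UniProt; switched to IPI below if the ID line matches lp_ipi (the DT line grammar differs)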
+
+ rlistener.startSequence();
+
+ if (ns==null) ns=RichObjectFactory.getDefaultNamespace();
+ rlistener.setNamespace(ns);
+
+ // Get an ordered list of key->value pairs in array-tuples
+ String sectionKey = null;
+ NCBITaxon tax = null;
+ accession = null;
+ List section = null;
+ try{
+ do {
+
+ section = this.readSection(reader);
+ sectionKey = ((String[])section.get(0))[0];
+ if(sectionKey == null){
+ String message = ParseException.newMessage(this.getClass(),accession, "", "Section key was null", sectionToString(section));
+ throw new ParseException(message);
+ }
+ // process section-by-section
+ if (sectionKey.equals(LOCUS_TAG)) {
+ // entryname dataclass; moltype; sequencelength AA.
+ String loc = ((String[])section.get(0))[1];
+ Matcher m = lp_uniprot.matcher(loc);
+ if (m.matches()) {
+ rlistener.setName(m.group(2));
+ rlistener.setDivision(m.group(3));
+ rlistener.addSequenceProperty(Terms.getDataClassTerm(), m.group(4));
+ // groupCount() is constant for the pattern, so test the optional PRT group directly
+ rlistener.addSequenceProperty(Terms.getMolTypeTerm(), m.group(5)==null ? "" : m.group(5));
+ } else {
+ m = lp_ipi.matcher(loc);
+ if (m.matches()) {
+ subformat = SUBFORMAT_IPI;
+ rlistener.setName(m.group(2));
+ rlistener.setVersion(Integer.parseInt(m.group(3)));
+ rlistener.addSequenceProperty(Terms.getDataClassTerm(), m.group(4));
+ rlistener.addSequenceProperty(Terms.getMolTypeTerm(),m.group(5));
+ } else {
+ String message = ParseException.newMessage(this.getClass(),accession, "", "Bad ID line", sectionToString(section));
+ throw new ParseException(message);
+ }
+ }
+ } else if (sectionKey.equals(DEFINITION_TAG)) {
+ String val = ((String[])section.get(0))[1];
+ if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot
+ rlistener.setDescription(val);
+ } else if (sectionKey.equals(SOURCE_TAG)) {
+ // use SOURCE_TAG and TAXON_TAG values
+ String sciname = null;
+ String comname = null;
+ List synonym = new ArrayList();
+ List lineage = new ArrayList();
+ int taxid = 0;
+ for (int i = 0; i < section.size(); i++) {
+ String tag = ((String[])section.get(i))[0];
+ String value = ((String[])section.get(i))[1].trim();
+ value = value.replace("\n", " ");
+ value = value.replace("\r\n", " ");
+
+ if (tag.equals(SOURCE_TAG)) {
+ if (value.endsWith(".")) value = value.substring(0,value.length()-1); // chomp trailing dot
+ String[] parts = value.split("\\(");
+ sciname = parts[0].trim();
+ if (parts.length>1) {
+ comname = parts[1].trim();
+ if (comname.endsWith(")")) comname = comname.substring(0,comname.length()-1); // chomp trailing bracket
+ if (parts.length>2) {
+ // synonyms
+ for (int j = 2 ; j < parts.length; j++) {
+ String syn = parts[j].trim();
+ if (syn.endsWith(")")) syn = syn.substring(0,syn.length()-1); // chomp trailing bracket
+ synonym.add(syn);
+ }
+ }
+ }
+ } else if (tag.equals(TAXON_TAG)) {
+ String[] parts = value.split(";");
+ for (int j = 0; j < parts.length; j++) {
+ String[] bits = parts[j].split("=");
+ if (bits[0].equals("NCBI_TaxID")) {
+ String[] morebits = bits[1].split(",");
+ taxid = Integer.parseInt(morebits[0].split(" ")[0].trim());
+ }
+ }
+ } else if (tag.equals(ORGANELLE_TAG)) {
+ if (value.endsWith(".")) value = value.substring(0,value.length()-1); // chomp trailing dot
+ String[] parts = value.split(";");
+ for (int j = 0; j < parts.length; j++) {
+ parts[j]=parts[j].trim();
+ rlistener.addSequenceProperty(Terms.getOrganelleTerm(),parts[j]);
+ }
+ }
+ //added by xiujuan 2016.5.12
+ else if(tag.equals(ORGANISM_TAG)){
+ if (value.endsWith(".")) value = value.substring(0,value.length()-1); // chomp trailing dot
+ String[] parts = value.split(";");
+ for (int j = 0; j < parts.length; j++) {
+ parts[j]=parts[j].trim();
+ lineage.add(parts[j]);
+ }
+ } else if (tag.equals(ORGANISM_HOST_TAG)) { // "OH" tag: organism host
+ String[] parts = value.split("\\. ");
+ for(int j = 0; j < parts.length; j++){
+ rlistener.addSequenceProperty(Terms.getOrganismHostTerm(),parts[j]);
+ }
+ }
+ }
+ // Set the Taxon
+ tax = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, new Object[]{new Integer(taxid)});
+ rlistener.setTaxon(tax);
+ try {
+ if (sciname!=null) tax.addName(NCBITaxon.SCIENTIFIC,sciname);
+ if (comname!=null) tax.addName(NCBITaxon.COMMON,comname);
+ for (Iterator j = synonym.iterator(); j.hasNext(); ) tax.addName(NCBITaxon.SYNONYM, (String)j.next());
+ for (Iterator j = lineage.iterator(); j.hasNext(); ) tax.addName("lineage", (String)j.next());
+ } catch (ChangeVetoException e) {
+ throw new ParseException(e);
+ }
+ } else if (sectionKey.equals(DATE_TAG)) {
+ String chunk = ((String[])section.get(0))[1];
+ if(subformat.equals(SUBFORMAT_UNIPROT)) {
+ Matcher dm = dp_uniprot.matcher(chunk);
+ if (dm.matches()) {
+ String date = dm.group(1).trim();
+ String type = dm.group(2).trim();
+ String rel = dm.group(3);
+ if (rel!=null) rel = rel.trim();
+ if (type.startsWith("integrated into UniProtKB")) {
+ String dbname = type.split("/")[1];
+ rlistener.addSequenceProperty(Terms.getDateCreatedTerm(), date);
+ rlistener.addSequenceProperty(Terms.getUniProtDBNameTerm(), dbname);
+ } else if (type.equalsIgnoreCase("sequence version")) {
+ if (rel==null){
+ String message = ParseException.newMessage(this.getClass(),accession, "", "Version missing for "+type, sectionToString(section));
+ throw new ParseException(message);
+ }
+ rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(), date);
+ rlistener.setVersion(Integer.parseInt(rel));
+ } else if (type.equalsIgnoreCase("entry version")) {
+ if (rel==null) {
+ String message = ParseException.newMessage(this.getClass(),accession, "", "Version missing for "+type, sectionToString(section));
+ throw new ParseException(message);
+ }
+ rlistener.addSequenceProperty(Terms.getDateAnnotatedTerm(), date);
+ rlistener.addSequenceProperty(Terms.getRelAnnotatedTerm(), rel);
+ } else {
+ String message = ParseException.newMessage(this.getClass(),accession, "", "Bad date type "+type, sectionToString(section));
+ throw new ParseException(message);
+ }
+ } else {
+ String message = ParseException.newMessage(this.getClass(),accession, "", "Bad date line", sectionToString(section));
+ throw new ParseException(message);
+ }
+ } else if(subformat.equals(SUBFORMAT_IPI)) {
+ Matcher dm = dp_ipi.matcher(chunk);
+ if (dm.matches()) {
+ String date = dm.group(1).trim();
+ String type = dm.group(3).trim();
+ if(type.equals("Created")) {
+ rlistener.addSequenceProperty(Terms.getDateCreatedTerm(), date);
+ } else if(type.equals("Last sequence update")) {
+ rlistener.addSequenceProperty(Terms.getDateUpdatedTerm(), date);
+ } else {
+ String message = ParseException.newMessage(this.getClass(),accession, "", "Bad date type "+type, sectionToString(section));
+ throw new ParseException(message);
+ }
+ } else {
+ String message = ParseException.newMessage(this.getClass(),accession, "", "Bad date line", sectionToString(section));
+ throw new ParseException(message);
+ }
+ } else {
+ String message = ParseException.newMessage(this.getClass(),accession, "", "Unknown date line format", sectionToString(section));
+ throw new ParseException(message);
+ }
+ } else if (sectionKey.equals(ACCESSION_TAG)) {
+ // if multiple accessions, store only first as accession,
+ // and store rest in annotation
+ String[] accs = ((String[])section.get(0))[1].split(";");
+ if(accs.length>0) accession = accs[0].trim(); else accession = "";
+ rlistener.setAccession(accession);
+ for (int i = 1; i < accs.length; i++) {
+ rlistener.addSequenceProperty(Terms.getAdditionalAccessionTerm(),accs[i].trim());
+ }
+ } else if (sectionKey.equals(PROTEIN_EXIST_TAG)) {
+ String val = ((String[])section.get(0))[1];
+ if (val.endsWith(";")) val = val.substring(0, val.length()-1); // chomp semicolon
+ rlistener.addSequenceProperty(Terms.getProteinExistsTerm(),val.trim());
+ } else if (sectionKey.equals(KEYWORDS_TAG)) {
+ String val = ((String[])section.get(0))[1];
+ if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot
+ val = val.replace('\n',' '); //remove newline
+ String[] kws = val.split(";");
+ for (int i = 0; i < kws.length; i++) {
+ String kw = kws[i].trim();
+ if (kw.length()==0) continue;
+ rlistener.addSequenceProperty(Terms.getKeywordTerm(), kw);
+ }
+ } else if (sectionKey.equals(GENE_TAG)) {
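+ // GN lines: genes separated by "and"/"or"; each carries Name=, Synonyms=, OrderedLocusNames= and ORFNames= pairs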
+ String[] genes = ((String[])section.get(0))[1].split("\\s+(or|and)\\s+");
+ for (int geneID = 0; geneID < genes.length; geneID++) {
+ String[] parts = genes[geneID].replace('\n', ' ').split(";");
+ for (int j = 0; j < parts.length; j++) {
+ if(parts[j].matches(".+=.+")){
+ String[] moreparts = parts[j].split("=");
+ String[] values = moreparts[1].split(",");
+ // nasty hack - we really should have notes on the gene object itself... if such a thing existed...
+ if (moreparts[0].trim().equals(Terms.GENENAME_KEY)) rlistener.addSequenceProperty(Terms.getGeneNameTerm(),geneID+":"+values[0].trim());
+ else if (moreparts[0].trim().equals(Terms.GENESYNONYM_KEY)) {
+ for (int k = 0; k < values.length; k++) rlistener.addSequenceProperty(Terms.getGeneSynonymTerm(),geneID+":"+values[k].trim());
+ } else if (moreparts[0].trim().equals(Terms.ORDLOCNAME_KEY)) {
+ for (int k = 0; k < values.length; k++) rlistener.addSequenceProperty(Terms.getOrderedLocusNameTerm(),geneID+":"+values[k].trim());
+ } else if (moreparts[0].trim().equals(Terms.ORFNAME_KEY)) {
+ for (int k = 0; k < values.length; k++) rlistener.addSequenceProperty(Terms.getORFNameTerm(),geneID+":"+values[k].trim());
+ }
+ }
+ }
+ }
+ } else if (sectionKey.equals(DATABASE_XREF_TAG)) {
+ // database_identifier; primary_identifier; secondary_identifier....
+ String val = ((String[])section.get(0))[1];
+ if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot
+ String[] parts = val.split(";");
+ // construct a DBXREF out of the dbname part[0] and accession part[1]
+ String dbname = parts[0].trim();
+ String acc = parts[1].trim();
+ CrossRef crossRef = (CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{dbname,acc,new Integer(0)});
+ // assign remaining bits of info as additional accession annotations
+ for (int j = 2; j < parts.length; j++) {
+ ComparableTerm t = (ComparableTerm) Terms.getAdditionalAccessionTerm();
+ Note note = new SimpleNote(t,parts[j].trim(),j-1);
+ try {
+ crossRef.getRichAnnotation().addNote(note);
+ } catch (ChangeVetoException ce) {
+ ParseException pe = new ParseException("Could not annotate additional accession terms");
+ pe.initCause(ce);
+ throw pe;
+ }
+ }
+ RankedCrossRef rcrossRef = new SimpleRankedCrossRef(crossRef, 0);
+ rlistener.setRankedCrossRef(rcrossRef);
+ } else if (sectionKey.equals(REFERENCE_TAG) && !this.getElideReferences()) {
+ // first line of section has rank and location
+ String refrank = ((String[])section.get(0))[1];
+ refrank = refrank.trim().split(" ")[0];
+ int ref_rank = Integer.parseInt(refrank.substring(1,refrank.length()-1));
+ // rest can be in any order
+ String authors = null;
+ String consortium = null;
+ String title = null;
+ String locator = null;
+ String pubmed = null;
+ String medline = null;
+ String doi = null;
+ String remark = null;
+ Integer rstart = null;
+ Integer rend = null;
+ for (int i = 1; i < section.size(); i++) {
+ String key = ((String[])section.get(i))[0];
+ String val = ((String[])section.get(i))[1];
+ //System.err.println(key+": "+val);
+ if (key.equals(AUTHORS_TAG)) {
+ if (val.endsWith(";")) val = val.substring(0, val.length()-1); // chomp semicolon
+ authors = val.replace('\n',' '); //see #2276
+ }
+ if (key.equals(CONSORTIUM_TAG)) {
+ if (val.endsWith(";")) val = val.substring(0, val.length()-1); // chomp semicolon
+ consortium = val.replace('\n',' '); //see #2276
+ }
+ if (key.equals(TITLE_TAG)) {
+ if (val.endsWith(";")) val = val.substring(0, val.length()-1); // chomp semicolon
+ if (val.endsWith("\"")) val = val.substring(1, val.length()-1); // chomp quotes
+ title = val.replace('\n',' '); //see #2276
+ }
+ if (key.equals(LOCATION_TAG)) {
+ if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot
+ locator = val.replace('\n',' '); //see #2276
+ }
+ if (key.equals(REFERENCE_XREF_TAG)) {
+ // database_identifier=primary_identifier;
+ String[] refs = val.split(";");
+ for (int j = 0 ; j < refs.length; j++) {
+ if (refs[j].trim().length()==0) continue;
+ String[] parts = refs[j].split("=");
+ if ( parts.length <2) {
+ // some DOI lines look like this and are causing problems:
+ //DOI=10.1002/(SICI)1097-0215(19990702)82:1<137::AID-IJC23>3.0.CO;2-F;ignoring
+ System.err.println("warning: problems while parsing: " + val);
+ continue;
+ }
+ String db = parts[0].trim();
+ String ref = parts[1].trim();
+ if (db.equalsIgnoreCase(Terms.PUBMED_KEY)) pubmed = ref;
+ else if (db.equalsIgnoreCase(Terms.MEDLINE_KEY)) medline = ref;
+ else if (db.equalsIgnoreCase(Terms.DOI_KEY)) doi = ref;
+ }
+ }
+ if (key.equals(RP_LINE_TAG)) {
+ if (val.endsWith(".")) val = val.substring(0, val.length()-1); // chomp dot
+ remark = val.replace('\n',' '); //see #2276
+ // Try to use it to find the location of the reference, if we have one.
+ Matcher m = rppat.matcher(val);
+ if (m.matches()) {
+ rstart = Integer.valueOf(m.group(1));
+ rend = Integer.valueOf(m.group(2));
+ }
+ }
+ if (key.equals(RC_LINE_TAG)) {
+ // Split into key=value pairs separated by semicolons and terminated with semicolon.
+ String[] parts = val.split(";");
+ for (int j = 0; j < parts.length; j++) {
+ String[] subparts = parts[j].split("=");
+ // get term for first section
+ String termName = subparts[0].trim();
+ Term t;
+ if (termName.equalsIgnoreCase(Terms.SPECIES_KEY)) t = Terms.getSpeciesTerm();
+ else if (termName.equalsIgnoreCase(Terms.STRAIN_KEY)) t = Terms.getStrainTerm();
+ else if (termName.equalsIgnoreCase(Terms.TISSUE_KEY)) t = Terms.getTissueTerm();
+ else if (termName.equalsIgnoreCase(Terms.TRANSPOSON_KEY)) t = Terms.getTransposonTerm();
+ else if (termName.equalsIgnoreCase(Terms.PLASMID_KEY)) t = Terms.getPlasmidTerm();
+ else {
+ String message = ParseException.newMessage(this.getClass(),accession, "", "Invalid RC term found: "+termName, sectionToString(section));
+ throw new ParseException(message);
+ }
+ // assign notes using term and rank:second section as value
+ // nasty hack - we really should have notes on the reference itself.
+ rlistener.addSequenceProperty("docref_" + t.toString(), ref_rank+":"+subparts[1].trim());
+ }
+ }
+ }
+
+ // create the docref object
+ try {
+ List auths = null;
+ if(authors != null) auths = DocRefAuthor.Tools.parseAuthorString(authors);
+ if (consortium!=null){
+ if(auths == null) auths = new ArrayList();
+ auths.add(new SimpleDocRefAuthor(consortium,true,false));
+ }
+ DocRef dr = (DocRef)RichObjectFactory.getObject(SimpleDocRef.class,new Object[]{auths,locator,title});
+
+ //save all Crossref to the sequence property
+ if (medline!=null) rlistener.addSequenceProperty("docref_"+"medline", ref_rank+":"+medline);
+ if (pubmed!=null) rlistener.addSequenceProperty("docref_"+"pubmed", ref_rank+":"+pubmed);
+ if (doi!=null) rlistener.addSequenceProperty("docref_"+"doi", ref_rank+":"+doi);
+ // assign either the pubmed or medline to the docref - medline gets priority, then pubmed, then doi
+// if (medline!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.MEDLINE_KEY, medline, new Integer(0)}));
+// else if (pubmed!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.PUBMED_KEY, pubmed, new Integer(0)}));
+// else if (doi!=null) dr.setCrossref((CrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{Terms.DOI_KEY, doi, new Integer(0)}));
+ // assign the remarks
+ if (!this.getElideComments()) dr.setRemark(remark);
+ // assign the docref to the bioentry
+ RankedDocRef rdr = new SimpleRankedDocRef(dr,rstart,rend,ref_rank);
+ rlistener.setRankedDocRef(rdr);
+ } catch (ChangeVetoException e) {
+ throw new ParseException(e);
+ }
+ } else if (sectionKey.equals(COMMENT_TAG) && !this.getElideComments()) {
+ // Set up some comments
+ String val = ((String[])section.get(0))[1];
+ if (UniProtCommentParser.isParseable(val)) rlistener.setComment(val);
+ else {
+ // copyright message
+ rlistener.addSequenceProperty(Terms.getCopyrightTerm(), val);
+ }
+ } else if (sectionKey.equals(FEATURE_TAG) && !this.getElideFeatures()) {
+ // starting from second line of input, start a new feature whenever we come across
+ // a key that does not start with /
+ boolean seenAFeature = false;
+ for (int i = 1 ; i < section.size(); i++) {
+ String key = ((String[])section.get(i))[0];
+ String val = ((String[])section.get(i))[1];
+ val = val.replaceAll("\\s*[\\n\\r]+\\s*", " ").trim();
+ if (val.endsWith(".")) val = val.substring(0,val.length()-1); // chomp dot
+ if (key.startsWith("/")) {
+ key = key.substring(1); // strip leading slash
+ if (key.equals("FTId")) rlistener.addFeatureProperty(Terms.getFTIdTerm(),val);
+ else {
+ // should never happen - but here just in case
+ rlistener.addFeatureProperty(RichObjectFactory.getDefaultOntology().getOrCreateTerm(key),val);
+ }
+ } else {
+ // new feature!
+ // end previous feature
+ if (seenAFeature) rlistener.endFeature();
+ // start next one, with lots of lovely info in it
+ RichFeature.Template templ = new RichFeature.Template();
+ templ.annotation = new SimpleRichAnnotation();
+ templ.sourceTerm = Terms.getUniProtTerm();
+ templ.typeTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm(key);
+ templ.featureRelationshipSet = new TreeSet();
+ templ.rankedCrossRefs = new TreeSet();
+ String desc = null;
+ Matcher m = fp.matcher(val);
+ if (m.matches()) {
+ String loc = m.group(1);
+ desc = m.group(3);
+ templ.location = UniProtLocationParser.parseLocation(loc);
+ } else {
+ String message = ParseException.newMessage(this.getClass(),accession, "", "Bad feature value: "+val, sectionToString(section));
+ throw new ParseException(message);
+ }
+ rlistener.startFeature(templ);
+ if (desc!=null && desc.length()>0) rlistener.addFeatureProperty(Terms.getFeatureDescTerm(),desc);
+ seenAFeature = true;
+ }
+ }
+ if (seenAFeature) rlistener.endFeature();
+ } else if (sectionKey.equals(START_SEQUENCE_TAG) && !this.getElideSymbols()) {
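+ // readSection puts the raw sequence text in the leading tuple(s) and the SQ header meta-info in the last tuple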
+ StringBuffer seq = new StringBuffer();
+
+ for (int i = 0 ; i < section.size()-1; i++) seq.append(((String[])section.get(i))[1]);
+ String seqMetaInfo = ((String[])section.get(section.size()-1))[1];
+ rlistener.addSequenceProperty(Terms.getSequenceMetaInfoTerm(), seqMetaInfo);
+ try {
+ SymbolList sl = new SimpleSymbolList(symParser,
+ seq.toString().replaceAll("\\s+","").replaceAll("[\\.|~]","-"));
+ rlistener.addSymbols(symParser.getAlphabet(),
+ (Symbol[])(sl.toList().toArray(new Symbol[0])),
+ 0, sl.length());
+ } catch (IllegalAlphabetException e) {
+ String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section));
+ throw new ParseException(e, message);
+ }
+ }
+ } while (!sectionKey.equals(END_SEQUENCE_TAG));
+ }catch (RuntimeException e){
+ String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section));
+ throw new ParseException(e, message);
+ }
+
+ // Allows us to tolerate trailing whitespace without
+ // thinking that there is another Sequence to follow
+ while (true) {
+ reader.mark(1);
+ int c = reader.read();
+ if (c == -1) {
+ hasAnotherSequence = false;
+ break;
+ }
+ if (Character.isWhitespace((char) c)) {
+ //hasInternalWhitespace = true;
+ continue;
+ }
+ //if (hasInternalWhitespace)
+ //System.err.println("Warning: whitespace found between sequence entries");
+ reader.reset();
+ break;
+ }
+
+ // Finish up.
+ rlistener.endSequence();
+ return hasAnotherSequence;
+ }
+
+ // reads an indented section, combining split lines and creating a list of key->value tuples
+ private List readSection(BufferedReader br) throws ParseException {
+ List section = new ArrayList();
+ String line;
+ boolean done = false;
+
+ // while not done
+ try {
+ while (!done) {
+ // mark buffer
+ br.mark(320);
+ // read token
+ line = br.readLine();
+ if (line.length()<2) {
+ String message = ParseException.newMessage(this.getClass(),accession, "", "Bad line found: "+line, sectionToString(section));
+ throw new ParseException(message);
+ }
+ String token = line.substring(0,2);
+ // READ SEQUENCE SECTION
+ if (token.equals(START_SEQUENCE_TAG)) {
+ // from next line, read sequence until // - leave // on stack
+ StringBuffer sb = new StringBuffer();
+ String sequence_meta_info = line.substring(5);
+ while (!done) {
+ br.mark(160);
+ line = br.readLine();
+ if (line.startsWith(END_SEQUENCE_TAG)) {
+ br.reset();
+ done = true;
+ } else {
+ // create sequence tag->value pair to return, sans numbers
+ sb.append(line);
+ }
+ }
+ section.add(new String[]{START_SEQUENCE_TAG,sb.toString()});
+ section.add(new String[]{"Sequence_Meta_Info", sequence_meta_info});
+ }
+ // READ COMMENT SECTION
+ else if (token.equals(COMMENT_TAG)) {
+ // read from first line till next that begins with "CC -!-"
+ StringBuffer currentVal = new StringBuffer();
+ boolean wasMisc = false;
+ if (!line.startsWith(COMMENT_TAG+" -!-")) wasMisc = true;
+ currentVal.append(line.substring(5));
+ while (!done) {
+ br.mark(160);
+ line = br.readLine();
+ if (((!wasMisc) && line.charAt(5)!=' ') || !line.startsWith("C") || line.startsWith(COMMENT_TAG+" -!-")) {
+ br.reset();
+ done = true;
+ // dump current tag if exists
+ section.add(new String[]{COMMENT_TAG,currentVal.toString()});
+ } else {
+ currentVal.append("\n");
+ currentVal.append(line.substring(5));
+ }
+ }
+ }
+ // READ FEATURE TABLE SECTION
+ else if (token.equals(FEATURE_TAG)) {
+ br.reset();
+ // read all FT lines until first non-FT starting line
+ String currentTag = null;
+ StringBuffer currentVal = new StringBuffer();
+ section.add(new String[]{FEATURE_TAG,null});
+ while (!done) {
+ br.mark(160);
+ line = br.readLine();
+ if (!line.startsWith(FEATURE_TAG)) {
+ br.reset();
+ done = true;
+ // dump current tag if exists
+ if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
+ } else {
+ // FT lines: FT KEY_NAME x x description
+ // or: FT ....
+ // or FT /FTId=899.
+ line = line.substring(5); // chomp off "FT "
+ if (!line.startsWith(" ")) {
+ // dump current tag if exists
+ if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
+ // case 1 : word value - splits into key-value based on first 8 chars
+ currentTag = line.substring(0,8).trim();
+ currentVal = new StringBuffer();
+ currentVal.append(line.substring(8).trim());
+ } else {
+ line = line.trim();
+ if (line.startsWith("/") && line.indexOf("=") != -1) {
+ // dump current tag if exists
+ if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
+ // case 3 : /word=.....
+ currentVal = new StringBuffer();
+ int equalIndex = line.indexOf('=');
+ if (equalIndex>=0) {
+ currentTag = line.substring(0, equalIndex);
+ currentVal.append(line.substring(equalIndex+1));
+ } else {
+ currentTag = line;
+ }
+ } else {
+ // case 2 : ...."
+ currentVal.append("\n");
+ currentVal.append(line);
+ }
+ }
+ }
+ }
+ }
+ // READ DOCREF
+ else if (token.equals(DATABASE_XREF_TAG)) {
+ section.add(new String[]{DATABASE_XREF_TAG,line.substring(5).trim()});
+ done = true;
+ }
+ // READ DATE
+ else if (token.equals(DATE_TAG)) {
+ section.add(new String[]{DATE_TAG,line.substring(5).trim()});
+ done = true;
+ }
+ // READ END OF SEQUENCE
+ else if (token.equals(END_SEQUENCE_TAG)) {
+ section.add(new String[]{END_SEQUENCE_TAG,null});
+ done = true;
+ }
+ // READ NORMAL TAG/VALUE SECTION
+ else {
+ // rewind buffer to mark
+ br.reset();
+ // read token/values until first with non-same first character
+ // exceptions: DE/DT, and RN...RN
+ String currentTag = null;
+ char currentTagStart = '\0';
+ StringBuffer currentVal = null;
+ while (!done) {
+ br.mark(320);
+ line = br.readLine();
+ if (currentTagStart=='\0') currentTagStart = line.charAt(0);
+ if (!line.startsWith(""+currentTagStart) ||
+ (currentTagStart=='D' && currentTag!=null && !line.startsWith(""+currentTag)) ||
+ (currentTagStart=='R' && currentTag!=null && line.startsWith("RN"))) {
+ br.reset();
+ done = true;
+ // dump current tag if exists
+ if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
+ } else {
+ try {
+ // merge neighbouring repeated tokens by concatting values
+ // return tag->value pairs
+ String tag = line.substring(0,2);
+ String value = line.substring(5);
+ if (currentTag==null || !tag.equals(currentTag)) {
+ // dump current tag if exists
+ if (currentTag!=null) section.add(new String[]{currentTag,currentVal.toString()});
+ // start new tag
+ currentTag = tag;
+ currentVal = new StringBuffer();
+ currentVal.append(value);
+ } else {
+ currentVal.append("\n");
+ currentVal.append(value);
+ }
+ } catch (Exception e) {
+ String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section));
+ throw new ParseException(e, message);
+ }
+ }
+ }
+ }
+ }
+ } catch (IOException e) {
+ String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section));
+ throw new ParseException(e, message);
+ } catch (RuntimeException e){
+ String message = ParseException.newMessage(this.getClass(),accession, "", "", sectionToString(section));
+ throw new ParseException(e, message);
+ }
+ return section;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public void writeSequence(Sequence seq, PrintStream os) throws IOException {
+ if (this.getPrintStream()==null) this.setPrintStream(os);
+ this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException {
+ if (this.getPrintStream()==null) this.setPrintStream(os);
+ if (!format.equals(this.getDefaultFormat())) throw new IllegalArgumentException("Unknown format: "+format);
+ this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
+ }
+
+ /**
+ * {@inheritDoc}
+ * Namespace is ignored as UniProt has no concept of it.
+ */
+ public void writeSequence(Sequence seq, Namespace ns) throws IOException {
+ RichSequence rs;
+ try {
+ if (seq instanceof RichSequence) rs = (RichSequence)seq;
+ else rs = RichSequence.Tools.enrich(seq);
+ } catch (ChangeVetoException e) {
+ IOException e2 = new IOException("Unable to enrich sequence");
+ e2.initCause(e);
+ throw e2;
+ }
+
+ SymbolTokenization tok;
+ try {
+ tok = rs.getAlphabet().getTokenization("token");
+ } catch (Exception e) {
+ throw new RuntimeException("Unable to get alphabet tokenizer",e);
+ }
+
+ Set notes = rs.getNoteSet();
+ String accession = rs.getAccession();
+ StringBuffer accessions = new StringBuffer();
+ accessions.append(accession);
+ accessions.append(";");
+ String cdat = null;
+ String udat = null;
+ String adat = null;
+ String dbname = "?";
+ String arel = null;
+ String organelle = null;
+ String protExists = null;
+ String dataclass = "STANDARD";
+ String copyright = null;
+ Map speciesRecs = new TreeMap();
+ Map strainRecs = new TreeMap();
+ Map tissueRecs = new TreeMap();
+ Map transpRecs = new TreeMap();
+ Map plasmidRecs = new TreeMap();
+ Map genenames = new TreeMap();
+ Map genesynonyms = new TreeMap();
+ Map orfnames = new TreeMap();
+ Map ordlocnames = new TreeMap();
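+ // Harvest header fields from the note set; rank-prefixed values ("rank:value") are routed back to their reference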
+ for (Iterator i = notes.iterator(); i.hasNext(); ) {
+ Note n = i.next();
+ if (n.getTerm().equals(Terms.getDateCreatedTerm())) cdat=n.getValue();
+ else if (n.getTerm().equals(Terms.getDateUpdatedTerm())) udat=n.getValue();
+ else if (n.getTerm().equals(Terms.getDateAnnotatedTerm())) adat=n.getValue();
+ else if (n.getTerm().equals(Terms.getUniProtDBNameTerm())) dbname=n.getValue();
+ else if (n.getTerm().equals(Terms.getProteinExistsTerm())) protExists=n.getValue();
+ else if (n.getTerm().equals(Terms.getRelAnnotatedTerm())) arel=n.getValue();
+ else if (n.getTerm().equals(Terms.getDataClassTerm())) dataclass = n.getValue();
+ else if (n.getTerm().equals(Terms.getCopyrightTerm())) copyright = n.getValue();
+ else if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) {
+ accessions.append(" ");
+ accessions.append(n.getValue());
+ accessions.append(";");
+ } else if (n.getTerm().equals(Terms.getOrganelleTerm())) organelle = (organelle==null?"":organelle+"; ")+n.getValue();
+ // use the nasty hack to split the reference rank away from the actual value in this field
+ else if (n.getTerm().equals(Terms.getGeneNameTerm())) {
+ String ref = n.getValue();
+ int colon = ref.indexOf(':');
+ Integer refID = new Integer(0);
+ if (colon>=1) refID = new Integer(ref.substring(0,colon));
+ genenames.put(refID, ref.substring(colon+1)); // map of id -> string as only one name per gene
+ } else if (n.getTerm().equals(Terms.getGeneSynonymTerm())) {
+ String ref = n.getValue();
+ int colon = ref.indexOf(':');
+ Integer refID = new Integer(0);
+ if (colon>=1) refID = new Integer(ref.substring(0,colon));
+ if (genesynonyms.get(refID)==null) genesynonyms.put(refID, new ArrayList());
+ ((List)genesynonyms.get(refID)).add(ref.substring(colon+1));
+ } else if (n.getTerm().equals(Terms.getOrderedLocusNameTerm())) {
+ String ref = n.getValue();
+ int colon = ref.indexOf(':');
+ Integer refID = new Integer(0);
+ if (colon>=1) refID = new Integer(ref.substring(0,colon));
+ if (ordlocnames.get(refID)==null) ordlocnames.put(refID, new ArrayList());
+ ((List)ordlocnames.get(refID)).add(ref.substring(colon+1));
+ } else if (n.getTerm().equals(Terms.getORFNameTerm())) {
+ String ref = n.getValue();
+ int colon = ref.indexOf(':');
+ Integer refID = new Integer(0);
+ if (colon>=1) refID = new Integer(ref.substring(0,colon));
+ if (orfnames.get(refID)==null) orfnames.put(refID, new ArrayList());
+ ((List)orfnames.get(refID)).add(ref.substring(colon+1));
+ }
+ // use the nasty hack to split the reference rank away from the actual value in this field
+ // we'll end up with a bunch in key 0 for those which did not come from us. We ignore these for now.
+ else if (n.getTerm().equals(Terms.getSpeciesTerm())) {
+ String ref = n.getValue();
+ int colon = ref.indexOf(':');
+ Integer refID = new Integer(0);
+ if (colon>=1) refID = new Integer(ref.substring(0,colon));
+ if (speciesRecs.get(refID)==null) speciesRecs.put(refID, new ArrayList());
+ ((List)speciesRecs.get(refID)).add(ref.substring(colon+1));
+ } else if (n.getTerm().equals(Terms.getStrainTerm())) {
+ String ref = n.getValue();
+ int colon = ref.indexOf(':');
+ Integer refID = new Integer(0);
+ if (colon>=1) refID = new Integer(ref.substring(0,colon));
+ if (strainRecs.get(refID)==null) strainRecs.put(refID, new ArrayList());
+ ((List)strainRecs.get(refID)).add(ref.substring(colon+1));
+ } else if (n.getTerm().equals(Terms.getTissueTerm())) {
+ String ref = n.getValue();
+ int colon = ref.indexOf(':');
+ Integer refID = new Integer(0);
+ if (colon>=1) refID = new Integer(ref.substring(0,colon));
+ if (tissueRecs.get(refID)==null) tissueRecs.put(refID, new ArrayList());
+ ((List)tissueRecs.get(refID)).add(ref.substring(colon+1));
+ } else if (n.getTerm().equals(Terms.getTransposonTerm())) {
+ String ref = n.getValue();
+ int colon = ref.indexOf(':');
+ Integer refID = new Integer(0);
+ if (colon>=1) refID = new Integer(ref.substring(0,colon));
+ if (transpRecs.get(refID)==null) transpRecs.put(refID, new ArrayList());
+ ((List)transpRecs.get(refID)).add(ref.substring(colon+1));
+ } else if (n.getTerm().equals(Terms.getPlasmidTerm())) {
+ String ref = n.getValue();
+ int colon = ref.indexOf(':');
+ Integer refID = new Integer(0);
+ if (colon>=1) refID = new Integer(ref.substring(0,colon));
+ if (plasmidRecs.get(refID)==null) plasmidRecs.put(refID, new ArrayList());
+ ((List)plasmidRecs.get(refID)).add(ref.substring(colon+1));
+ }
+ }
+
+ // entryname dataclass; [circular] molecule; division; sequencelength BP.
+ StringBuffer locusLine = new StringBuffer();
+ locusLine.append(StringTools.rightPad(rs.getName()+"_"+rs.getDivision(),12));
+ locusLine.append(" ");
+ locusLine.append(StringTools.leftPad(dataclass,19));
+ //locusLine.append("; PRT; "); //Uniprot no longer uses the PRT;
+ locusLine.append("; ");
+ locusLine.append(StringTools.leftPad(""+rs.length(),11));
+ locusLine.append(" AA.");
+ StringTools.writeKeyValueLine(LOCUS_TAG, locusLine.toString(), 5, this.getLineWidth(), null, LOCUS_TAG, this.getPrintStream());
+
+ // accession line
+ StringTools.writeKeyValueLine(ACCESSION_TAG, accessions.toString(), 5, this.getLineWidth(), null, ACCESSION_TAG, this.getPrintStream());
+
+ // date line
+ StringTools.writeKeyValueLine(DATE_TAG, (cdat==null?udat:cdat)+", integrated into UniProtKB/"+dbname+".", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream());
+ StringTools.writeKeyValueLine(DATE_TAG, udat+", sequence version "+rs.getVersion()+".", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream());
+ StringTools.writeKeyValueLine(DATE_TAG, (adat==null?udat:adat)+", entry version "+(arel==null?"0":arel)+".", 5, this.getLineWidth(), null, DATE_TAG, this.getPrintStream());
+
+ // definition line
+ StringTools.writeKeyValueLine(DEFINITION_TAG, rs.getDescription()+".", 5, this.getLineWidth(), null, DEFINITION_TAG, this.getPrintStream());
+
+ // gene line
+ for (Iterator i = genenames.keySet().iterator(); i.hasNext(); ) {
+ Integer geneid = (Integer)i.next();
+ String genename = (String)genenames.get(geneid);
+ List synonyms = (List)genesynonyms.get(geneid);
+ List orfs = (List)orfnames.get(geneid);
+ List ordlocs = (List)ordlocnames.get(geneid);
+
+ StringBuffer gnline = new StringBuffer();
+ gnline.append(Terms.GENENAME_KEY);
+ gnline.append("=");
+ gnline.append(genename);
+ gnline.append("; ");
+
+ if (synonyms!=null) {
+ gnline.append(Terms.GENESYNONYM_KEY);
+ gnline.append("=");
+ for (Iterator j = synonyms.iterator(); j.hasNext(); ) {
+ gnline.append((String)j.next());
+ if (j.hasNext()) gnline.append(", ");
+ }
+ gnline.append("; ");
+ }
+ if (ordlocs!=null) {
+ gnline.append(Terms.ORDLOCNAME_KEY);
+ gnline.append("=");
+ for (Iterator j = ordlocs.iterator(); j.hasNext(); ) {
+ gnline.append((String)j.next());
+ if (j.hasNext()) gnline.append(", ");
+ }
+ gnline.append("; ");
+ }
+ if (orfs!=null) {
+ gnline.append(Terms.ORFNAME_KEY);
+ gnline.append("=");
+ for (Iterator j = orfs.iterator(); j.hasNext(); ) {
+ gnline.append((String)j.next());
+ if (j.hasNext()) gnline.append(", ");
+ }
+ gnline.append("; ");
+ }
+
+ StringTools.writeKeyValueLine(GENE_TAG, gnline.toString(), 5, this.getLineWidth(), null, GENE_TAG, this.getPrintStream());
+
+ if (i.hasNext()) StringTools.writeKeyValueLine(GENE_TAG, "and", 5, this.getLineWidth(), null, GENE_TAG, this.getPrintStream());
+ }
+
+ // source line (from taxon)
+ // organism line
+ NCBITaxon tax = rs.getTaxon();
+ if (tax!=null) {
+ StringBuffer source = new StringBuffer();
+ source.append(tax.getDisplayName());
+ for (Iterator j = tax.getNames(NCBITaxon.SYNONYM).iterator(); j.hasNext(); ) {
+ source.append(" (");
+ source.append((String)j.next());
+ source.append(")");
+ }
+ source.append(".");
+ StringTools.writeKeyValueLine(SOURCE_TAG, source.toString(), 5, this.getLineWidth(), null, SOURCE_TAG, this.getPrintStream());
+ if (organelle!=null) StringTools.writeKeyValueLine(ORGANELLE_TAG, organelle+".", 5, this.getLineWidth(), null, ORGANELLE_TAG, this.getPrintStream());
+ StringTools.writeKeyValueLine(ORGANISM_TAG, tax.getNameHierarchy(), 5, this.getLineWidth(), null, ORGANISM_TAG, this.getPrintStream());
+ StringTools.writeKeyValueLine(TAXON_TAG, "NCBI_TaxID="+tax.getNCBITaxID()+";", 5, this.getLineWidth(), this.getPrintStream());
+ }
+
+ // references - rank (bases x to y)
+ for (Iterator<RankedDocRef> r = rs.getRankedDocRefs().iterator(); r.hasNext(); ) {
+ RankedDocRef rdr = r.next();
+ DocRef d = rdr.getDocumentReference();
+ // RN, RP, RC, RX, RG, RA, RT, RL
+ StringTools.writeKeyValueLine(REFERENCE_TAG, "["+rdr.getRank()+"]", 5, this.getLineWidth(), null, REFERENCE_TAG, this.getPrintStream());
+ if (d.getRemark()!=null)
+ StringTools.writeKeyValueLine(RP_LINE_TAG, d.getRemark()+".", 5, this.getLineWidth(), null, RP_LINE_TAG, this.getPrintStream());
+ // Print out ref position if present
+ if (rdr.getStart()!=null && rdr.getEnd()!=null && d.getRemark()!=null && !rppat.matcher(d.getRemark()).matches()) StringTools.writeKeyValueLine(RP_LINE_TAG, "SEQUENCE OF "+rdr.getStart()+"-"+rdr.getEnd()+".", 5, this.getLineWidth(), null, RP_LINE_TAG, this.getPrintStream());
+ // RC lines
+ StringBuffer rcline = new StringBuffer();
+ Integer rank = new Integer(rdr.getRank());
+ if (speciesRecs.get(rank)!=null) {
+ rcline.append(Terms.SPECIES_KEY);
+ rcline.append("=");
+ for (Iterator i = ((List)speciesRecs.get(rank)).iterator(); i.hasNext(); ) {
+ rcline.append((String)i.next());
+ if (i.hasNext()) rcline.append(", ");
+ }
+ rcline.append("; ");
+ }
+ if (strainRecs.get(rank)!=null) {
+ rcline.append(Terms.STRAIN_KEY);
+ rcline.append("=");
+ for (Iterator i = ((List)strainRecs.get(rank)).iterator(); i.hasNext(); ) {
+ rcline.append((String)i.next());
+ if (i.hasNext()) rcline.append(", ");
+ }
+ rcline.append("; ");
+ }
+ if (tissueRecs.get(rank)!=null) {
+ rcline.append(Terms.TISSUE_KEY);
+ rcline.append("=");
+ for (Iterator i = ((List)tissueRecs.get(rank)).iterator(); i.hasNext(); ) {
+ rcline.append((String)i.next());
+ if (i.hasNext()) rcline.append(", ");
+ }
+ rcline.append("; ");
+ }
+ if (transpRecs.get(rank)!=null) {
+ rcline.append(Terms.TRANSPOSON_KEY);
+ rcline.append("=");
+ for (Iterator i = ((List)transpRecs.get(rank)).iterator(); i.hasNext(); ) {
+ rcline.append((String)i.next());
+ if (i.hasNext()) rcline.append(", ");
+ }
+ rcline.append("; ");
+ }
+ if (plasmidRecs.get(rank)!=null) {
+ rcline.append(Terms.PLASMID_KEY);
+ rcline.append("=");
+ for (Iterator i = ((List)plasmidRecs.get(rank)).iterator(); i.hasNext(); ) {
+ rcline.append((String)i.next());
+ if (i.hasNext()) rcline.append(", ");
+ }
+ rcline.append("; ");
+ }
+ // print the rcline
+ if (rcline.length()>0) StringTools.writeKeyValueLine(RC_LINE_TAG, rcline.toString(), 5, this.getLineWidth(), null, RC_LINE_TAG, this.getPrintStream());
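+ // e.g. a populated RC line: "RC   STRAIN=Sprague-Dawley; TISSUE=Liver;"
+ // (keys come from the Terms.*_KEY constants; repeated values are comma-separated)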
+ // Deal with RX and rest
+ CrossRef c = d.getCrossref();
+ if (c!=null) StringTools.writeKeyValueLine(REFERENCE_XREF_TAG, c.getDbname()+"="+c.getAccession()+";", 5, this.getLineWidth(), null, REFERENCE_XREF_TAG, this.getPrintStream());
+ List<DocRefAuthor> auths = d.getAuthorList();
+ for (Iterator<DocRefAuthor> j = auths.iterator(); j.hasNext(); ) {
+ DocRefAuthor a = j.next();
+ if (a.isConsortium()) {
+ StringTools.writeKeyValueLine(CONSORTIUM_TAG, a.getName()+";", 5, this.getLineWidth(), null, CONSORTIUM_TAG, this.getPrintStream());
+ j.remove();
+ }
+ }
+ if (!auths.isEmpty()) StringTools.writeKeyValueLine(AUTHORS_TAG, DocRefAuthor.Tools.generateAuthorString(auths, false)+";", 5, this.getLineWidth(), null, AUTHORS_TAG, this.getPrintStream());
+ if (d.getTitle()!=null && d.getTitle().length()!=0) StringTools.writeKeyValueLine(TITLE_TAG, "\""+d.getTitle()+"\";", 5, this.getLineWidth(), null, TITLE_TAG, this.getPrintStream());
+ StringTools.writeKeyValueLine(LOCATION_TAG, d.getLocation()+".", 5, this.getLineWidth(), null, LOCATION_TAG, this.getPrintStream());
+ }
+
+ // comments - if any
+ if (!rs.getComments().isEmpty()) {
+ for (Iterator<Comment> i = rs.getComments().iterator(); i.hasNext(); ) {
+ Comment c = i.next();
+ String text = c.getComment().trim();
+ // structured ("-!-") and free-text comments are written identically
+ StringTools.writeKeyValueLine(COMMENT_TAG, text, 5, this.getLineWidth(), null, COMMENT_TAG, this.getPrintStream());
+ }
+ }
+
+ // copyright - if any
+ if (copyright!=null)
+ StringTools.writeKeyValueLine(COMMENT_TAG, copyright, 5, this.getLineWidth(), null, COMMENT_TAG, this.getPrintStream());
+
+ // db references - ranked
+ for (Iterator<RankedCrossRef> r = rs.getRankedCrossRefs().iterator(); r.hasNext(); ) {
+ RankedCrossRef rcr = r.next();
+ CrossRef c = rcr.getCrossRef();
+ Set<Note> noteset = c.getNoteSet();
+ StringBuffer sb = new StringBuffer();
+ sb.append(c.getDbname());
+ sb.append("; ");
+ sb.append(c.getAccession());
+ boolean hasSecondary = false;
+ for (Iterator<Note> i = noteset.iterator(); i.hasNext(); ) {
+ Note n = i.next();
+ if (n.getTerm().equals(Terms.getAdditionalAccessionTerm())) {
+ sb.append("; ");
+ sb.append(n.getValue());
+ hasSecondary = true;
+ }
+ }
+ if (!hasSecondary) sb.append("; -");
+ sb.append(".");
+ StringTools.writeKeyValueLine(DATABASE_XREF_TAG, sb.toString(), 5, this.getLineWidth(), null, DATABASE_XREF_TAG, this.getPrintStream());
+ }
+
+ // protein exists line
+ if (protExists!=null) {
+ StringTools.writeKeyValueLine(PROTEIN_EXIST_TAG, protExists+";", 5, this.getLineWidth(), null, PROTEIN_EXIST_TAG, this.getPrintStream());
+ }
+
+ // keywords line
+ String keywords = null;
+ for (Iterator<Note> n = notes.iterator(); n.hasNext(); ) {
+ Note nt = n.next();
+ if (nt.getTerm().equals(Terms.getKeywordTerm())) {
+ if (keywords==null) keywords = nt.getValue();
+ else keywords = keywords+"; "+nt.getValue();
+ }
+ }
+ if (keywords!=null) {
+ StringTools.writeKeyValueLine(KEYWORDS_TAG, keywords+".", 5, this.getLineWidth(), null, KEYWORDS_TAG, this.getPrintStream());
+ }
+
+ // feature_type location
+ for (Iterator i = rs.getFeatureSet().iterator(); i.hasNext(); ) {
+ RichFeature f = (RichFeature)i.next();
+ String desc = "";
+ String ftid = null;
+ for (Iterator<Note> j = f.getNoteSet().iterator(); j.hasNext(); ) {
+ Note n = j.next();
+ if (n.getTerm().equals(Terms.getFTIdTerm())) ftid = n.getValue();
+ else if (n.getTerm().equals(Terms.getFeatureDescTerm())) desc = n.getValue();
+ }
+ String kw = f.getTypeTerm().getName();
+ String leader = StringTools.rightPad(kw,8)+" "+UniProtLocationParser.writeLocation((RichLocation)f.getLocation());
+ if(desc.length()==0)
+ this.getPrintStream().println(FEATURE_TAG+" "+leader); //see #2277
+ else
+ StringTools.writeKeyValueLine(FEATURE_TAG+" "+leader, desc+".", 34, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
+ if (ftid!=null) StringTools.writeKeyValueLine(FEATURE_TAG, "/FTId="+ftid+".", 34, this.getLineWidth(), null, FEATURE_TAG, this.getPrintStream());
+ }
+
+ // sequence header
+ int mw = 0;
+ try {
+ mw = (int) MassCalc.getMolecularWeight(rs);
+ } catch (IllegalSymbolException e) {
+ throw new RuntimeException("Found illegal symbol", e);
+ }
+ CRC64Checksum crc = new CRC64Checksum();
+ String seqstr = rs.seqString();
+ crc.update(seqstr.getBytes(),0,seqstr.length());
+ this.getPrintStream().print(START_SEQUENCE_TAG+" SEQUENCE "+StringTools.leftPad(""+rs.length(),4)+" AA; ");
+ this.getPrintStream().print(StringTools.leftPad(""+mw,5)+" MW; ");
+ this.getPrintStream().println(crc+" CRC64;");
+
+ // sequence stuff
+ Symbol[] syms = (Symbol[])rs.toList().toArray(new Symbol[0]);
+ int symCount = 0;
+ this.getPrintStream().print(" ");
+ for (int i = 0; i < syms.length; i++) {
+ if (symCount % 60 == 0 && symCount>0) {
+ this.getPrintStream().print("\n ");
+ }
+ if (symCount % 10 == 0) {
+ this.getPrintStream().print(" ");
+ }
+ try {
+ this.getPrintStream().print(tok.tokenizeSymbol(syms[i]));
+ } catch (IllegalSymbolException e) {
+ throw new RuntimeException("Found illegal symbol: "+syms[i]);
+ }
+ symCount++;
+ }
+ this.getPrintStream().print("\n");
+ this.getPrintStream().println(END_SEQUENCE_TAG);
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public String getDefaultFormat() {
+ return UNIPROT_FORMAT;
+ }
+
+ /**
+ * Converts the current parse section to a String. Useful for debugging.
+ */
+ String sectionToString(List section){
+ StringBuffer parseBlock = new StringBuffer();
+ for(Iterator i = section.listIterator(); i.hasNext();){
+ String[] part = (String[])i.next();
+ for(int x = 0; x < part.length; x++){
+ parseBlock.append(part[x]);
+ if(x == 0){
+ parseBlock.append(" "); //the gap will have been trimmed
+ }
+ }
+ }
+ return parseBlock.toString();
+ }
+}
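
A note on the sequence block emitted at the end of writeSequence above: it prints 60 residues per line in space-separated groups of ten. The following self-contained sketch reproduces just that grouping for a plain string; SequenceBlockSketch and formatSequenceBlock are hypothetical names for illustration, not part of this patch.

    import java.io.PrintStream;

    public class SequenceBlockSketch {
        // Mirrors the loop in writeSequence: a new indented line every 60
        // symbols, a space before every group of 10.
        static void formatSequenceBlock(String seq, PrintStream out) {
            out.print("    ");                        // leading pad, as in the writer
            for (int i = 0; i < seq.length(); i++) {
                if (i % 60 == 0 && i > 0) out.print("\n    ");
                if (i % 10 == 0) out.print(" ");
                out.print(seq.charAt(i));
            }
            out.println();
        }

        public static void main(String[] args) {
            formatSequenceBlock(
                "MKAILVVLLYTFATANADTLCIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDKHNGKLCKLRGVAPLHLGKCNIAGWILGNPECESL",
                System.out);
        }
    }
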
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/ProcessNew.java b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/ProcessNew.java
new file mode 100644
index 0000000..9a080c9
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/microorganism/util/ProcessNew.java
@@ -0,0 +1,571 @@
+package cn.piflow.bundle.microorganism.util;
+
+import org.biojava.bio.seq.Feature;
+import org.biojavax.*;
+import org.biojavax.bio.seq.RichFeature;
+import org.biojavax.bio.seq.RichSequence;
+import org.biojavax.ontology.SimpleComparableTerm;
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Created by xiujuan on 2016/3/24.
+ */
+public class ProcessNew {
+
+ static final Logger logger = LoggerFactory.getLogger(ProcessNew.class);
+ static final Pattern dp = Pattern.compile("(\\d{4})");
+ static final Pattern llp = Pattern.compile("(\\S+)\\s([SN])\\s(\\S+)\\s([WE])");
+ static final Pattern submitDatep = Pattern.compile("^Submitted\\s+\\((\\S+)\\)\\s+(.*)$");
+ static final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
+ static final SimpleDateFormat format = new SimpleDateFormat("dd-MMM-yyyy", Locale.ENGLISH);
+
+ // static AddressCountryDict dict = AddressCountryDict.getInstance();
+
+ public static HashMap processSingleSequence(RichSequence seq) throws ParseException {
+
+ HashMap map = new HashMap();
+
+ map.put("Sequence", seq.seqString());
+ map.put("Accession", seq.getAccession());
+
+ map.put("SequenceLength", seq.getInternalSymbolList().length());
+ if (seq.getTaxon() != null) {
+ map.put("TaxonID", seq.getTaxon().getNCBITaxID());
+ map.put("Organism", seq.getTaxon().getDisplayName());
+ }
+ map.put("Description", seq.getDescription().replace('\n', ' '));
+
+ map.put("Division", seq.getDivision());
+ map.put("Identifier", seq.getIdentifier());
+ map.put("Version", seq.getVersion());
+
+ if (seq.getCircular()) {
+ map.put("Topology", "Circular");
+ } else {
+ map.put("Topology", "Linear");
+ }
+
+
+ for (Note note : seq.getNoteSet()) {
+ String noteName = note.getTerm().toString().substring(9); // strip the "biojavax:" prefix
+ if (noteName.indexOf("moltype") != -1) {
+ map.put("MoleculeType", note.getValue());
+ } else if (noteName.indexOf("Organism") != -1) {
+ String organism = note.getValue();
+ //doc.put("Organism", organism.substring(0,organism.indexOf("\n")));
+ map.put("Lineage", organism.substring(organism.indexOf("\n")).replaceAll("\n", ""));
+ } else if (noteName.indexOf("acc") != -1) {
+ map.put("AdditionalAccs", note.getValue());
+ } else if (noteName.indexOf("DBLink") != -1) { //deal with dblinks
+ JSONArray dbLinks = new JSONArray();
+ String[] val = note.getValue().split("\\n");
+ for (String v : val) {
+ int index = v.indexOf(":");
+ if (index != -1) {
+ JSONObject link = new JSONObject();
+ link.put(v.substring(0, index), v.substring(index + 1).trim());
+ dbLinks.put(link);
+ } else { // value split across more than one line
+ JSONObject last = dbLinks.getJSONObject(dbLinks.length() - 1);
+ String key = last.keys().next();
+ String value = last.get(key).toString();
+ String newVal = value + v;
+ last.put(key, newVal);
+ }
+ }
+ map.put("dbLinks", dbLinks);
+ } else if (noteName.equals("kw")) {
+ map.put("KeyWords", note.getValue());
+ } else if (noteName.equals("udat")) {
+ map.put("dateUpdated", formatter.format(format.parse(note.getValue())));
+ } else {
+ map.put(noteName, note.getValue());
+ }
+ }
+
+ //features
+ JSONArray featureArray = new JSONArray();
+ Iterator featureIterator = seq.features();
+ List isolates = new ArrayList();
+ while (featureIterator.hasNext()) {
+ JSONObject featureObject = new JSONObject();
+ List dbxrefArray = new ArrayList();
+ RichFeature feature = (RichFeature) featureIterator.next();
+ for (RankedCrossRef rankedCrossRef : feature.getRankedCrossRefs()) {
+ dbxrefArray.add(rankedCrossRef.getCrossRef().getDbname() + ":" + rankedCrossRef.getCrossRef().getAccession());
+ }
+ featureObject.put("db_xref", dbxrefArray);
+
+ featureObject.put("featureType", feature.getType());
+ Map featureMap = feature.getAnnotation().asMap();
+ Iterator<SimpleComparableTerm> featureKeyIterator = featureMap.keySet().iterator();
+ while (featureKeyIterator.hasNext()) {
+ SimpleComparableTerm term = featureKeyIterator.next();
+ String name = term.getName();
+ String nameValue = featureMap.get(term).toString();
+ //isolate is an array?
+
+ if (name.indexOf("altitude") != -1) {
+ featureObject.put("altitude_value", Float.valueOf(nameValue.substring(0, nameValue.indexOf(" ")))); //number, take care of negative number
+ } else if (name.indexOf("collection_date") != -1) {
+ if (getCollectionYear(nameValue) != 0) {
+ featureObject.put("collection_year", getCollectionYear(nameValue));
+ }
+ } else if (name.indexOf("country") != -1) {
+ if (nameValue.indexOf(":") != -1) {
+ featureObject.put("CollectionCountry", nameValue.substring(0, nameValue.indexOf(":")));
+ }
+ } else if (name.indexOf("culture_collection") != -1) {
+ int index = nameValue.indexOf(":") != -1 ? nameValue.indexOf(":") : nameValue.indexOf(" ");
+ if (index != -1) {
+ featureObject.put("InstitutionCode", nameValue.substring(0, index));
+ featureObject.put("CultureID", nameValue.substring(index + 1));
+ }
+ } else if (name.indexOf("lat_lon") != -1) {
+ Float[] arr = getLat_Lon(nameValue);
+ if (arr != null) {
+ featureObject.put("Latitude", arr[0]);
+ featureObject.put("Longitude", arr[1]);
+ }
+ } else if (name.indexOf("pathovar") != -1) {
+ // intentionally no special handling; the generic put below still records the value
+ } else if (feature.getType().equals("source") && name.equals("isolate")) {
+ isolates.add(nameValue);
+ }
+ featureObject.put(term.getName(), featureMap.get(term));
+ }
+ featureArray.put(featureObject);
+ //for garbage collection
+ featureObject = null;
+ dbxrefArray = null;
+ feature = null;
+ featureMap = null;
+ }
+ map.put("features", featureArray);
+ if (isolates.size() > 0) {
+ map.put("isolate_all", isolates);
+ }
+ return map;
+ }
+
+ public static int getCollectionYear(String date){
+ Matcher m = dp.matcher(date);
+ String year;
+ if(m.find()){
+ year = m.group(1);
+ return Integer.parseInt(year);
+ }else{
+ return 0;
+ }
+ }
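+
+ // Illustrative behaviour: the pattern grabs the first four-digit run, so
+ // getCollectionYear("21-Mar-2015") == 2015, getCollectionYear("2003/2004") == 2003,
+ // and getCollectionYear("March") == 0 when no four-digit year is present.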
+
+ public static Float[] getLat_Lon(String lat_lon){
+ Matcher m = llp.matcher(lat_lon);
+ Float[] array = null;
+ try{
+ if(m.matches()){
+ array = new Float[2];
+ if(m.group(2).equals("N")){
+ array[0] = Float.valueOf(m.group(1));
+ }else{
+ array[0] = Float.valueOf("0")-Float.valueOf(m.group(1));
+ }
+ if(m.group(4).equals("E")){
+ array[1] = Float.valueOf(m.group(3));
+ }else{
+ array[1] = Float.valueOf("0")-Float.valueOf(m.group(3));
+ }
+ }
+ }catch (NumberFormatException nfe){
+ return null;
+ }
+ return array;
+ }
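+
+ // Illustrative behaviour: the lat_lon pattern expects "<val> <N|S> <val> <E|W>",
+ // so getLat_Lon("52.52 N 13.40 E") yields [52.52, 13.40] and
+ // getLat_Lon("33.92 S 18.42 E") yields [-33.92, 18.42]; any other shape,
+ // or an unparseable number, returns null.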
+
+ public static void processUniprotSeq(RichSequence seq, JSONObject doc) throws ParseException {
+ logger.info("doc: " + seq.getAccession());
+ doc.put("Accession", seq.getAccession());
+ doc.put("Name", seq.getName());
+ doc.put("Division", seq.getDivision());
+ doc.put("Description", seq.getDescription().replace('\n', ' '));
+ doc.put("Version", seq.getVersion());
+ doc.put("sequencelength", seq.length());
+ //Taxon
+ doc.put("TaxonID", seq.getTaxon().getNCBITaxID());
+ for(Object name: seq.getTaxon().getNameClasses()){
+ doc.put("Taxon_"+(String)name, seq.getTaxon().getNames((String)name));
+ }
+
+ //rankedcrossrefs
+ /*JSONArray rankedCrossRefs = new JSONArray();
+ for(RankedCrossRef rankedCrossRef : seq.getRankedCrossRefs()){
+ JSONObject ref = new JSONObject();
+ String key = rankedCrossRef.getCrossRef().getDbname();
+ String accessions = rankedCrossRef.getCrossRef().getAccession();
+ for(Note note : rankedCrossRef.getCrossRef().getRichAnnotation().getNoteSet()){
+ accessions += ";"+note.getValue();
+ }
+ ref.put(key, accessions);
+ rankedCrossRefs.put(ref);
+ }
+ if(rankedCrossRefs.length() > 0){
+ doc.put("rankedCrossRefs", rankedCrossRefs);
+ }*/
+ processRankedCrossRefs(seq, doc);
+ //comments
+ JSONArray comments = new JSONArray();
+ for(Comment comment : seq.getComments()){
+ JSONObject cmtObj = new JSONObject();
+ String cmt = comment.getComment().replace('\n', ' ');
+ cmt = cmt.substring(3); // drop the leading "-!-"
+ int index = cmt.indexOf(":");
+ cmtObj.put(cmt.substring(0,index).trim(),cmt.substring(index+1).trim());
+ comments.put(cmtObj);
+ }
+ if(comments.length() > 0){
+ doc.put("comments", comments);
+ }
+ //features
+ JSONArray features = new JSONArray();
+ Iterator featureIterator = seq.features();
+ while(featureIterator.hasNext()){
+ JSONObject featureObject = new JSONObject();
+ List dbxrefArray = new ArrayList();
+ RichFeature feature = (RichFeature)featureIterator.next();
+ for(RankedCrossRef rankedCrossRef : feature.getRankedCrossRefs()){
+ dbxrefArray.add(rankedCrossRef.getCrossRef().getDbname() + ":" + rankedCrossRef.getCrossRef().getAccession());
+ }
+ if(dbxrefArray.size() > 0){
+ featureObject.put("rankedCrossRefs", dbxrefArray);
+ }
+ featureObject.put("type", feature.getType());
+ featureObject.put("location_start", feature.getLocation().getMin());
+ featureObject.put("location_end", feature.getLocation().getMax());
+ Map featureMap = feature.getAnnotation().asMap();
+ Iterator<SimpleComparableTerm> featureKeyIterator = featureMap.keySet().iterator();
+ while(featureKeyIterator.hasNext()){
+ SimpleComparableTerm term = featureKeyIterator.next();
+ featureObject.put(term.getName(),featureMap.get(term));
+ }
+ features.put(featureObject);
+ }
+ if(features.length() > 0){
+ doc.put("features", features);
+ }
+ //sequence
+ doc.put("sequence", seq.seqString());
+
+ JSONArray rankedDocRefs = new JSONArray();
+ Map<Integer, List<String>> rankedDocRefs_addiInfo = new HashMap<Integer, List<String>>();
+ //properties from notes: rlistener.addSequenceProperty
+ List<String> keywords = new ArrayList<String>();
+ List<String> secondaryAccs = new ArrayList<String>();
+ JSONArray organismHosts = new JSONArray();
+ for(Note note : seq.getNoteSet()){
+ String note_term = note.getTerm().getName();
+ if(note_term.equals("kw")){
+ keywords.add(note.getValue());
+ }else if(note_term.equals("cdat")){
+ doc.put("dateCreated", formatter.format(format.parse(note.getValue())));
+ }else if(note_term.equals("udat")){
+ doc.put("dateUpdated", formatter.format(format.parse(note.getValue())));
+ }else if(note_term.equals("adat")){
+ doc.put("dateAnnotated", formatter.format(format.parse(note.getValue())));
+ }else if(note_term.equals("arel")){
+ doc.put("relAnnotated", note.getValue());
+ }else if(note_term.equals("Organism host")){
+ JSONObject organismHost = new JSONObject();
+ String sciname;
+ String comname;
+ String names = null;
+ List<String> synonym = new ArrayList<String>();
+ String[] parts = note.getValue().split(";");
+ if(parts[0].matches("\\S+=\\S+")){
+ String[] moreparts = parts[0].split("=");
+ if(moreparts[0].equals("NCBI_TaxID")){
+ organismHost.put("NCBI_TaxID",Integer.parseInt(moreparts[1]));
+ }else{
+ organismHost.put(moreparts[0],moreparts[1]);
+ }
+ }else{
+ names = parts[0];
+ }
+ if(parts.length > 1){
+ names = parts[1];
+ }
+ if(names != null){
+ if (names.endsWith(".")) names = names.substring(0,names.length()-1); // chomp trailing dot
+ String[] nameparts = names.split("\\(");
+ sciname = nameparts[0].trim();
+ organismHost.put("scientific name", sciname);
+ if (nameparts.length>1) {
+ comname = nameparts[1].trim();
+ if (comname.endsWith(")")) comname = comname.substring(0,comname.length()-1); // chomp trailing bracket
+ organismHost.put("common name", comname);
+ if (nameparts.length>2) {
+ // synonyms
+ for (int j = 2 ; j < nameparts.length; j++) {
+ String syn = nameparts[j].trim();
+ if (syn.endsWith(")")) syn = syn.substring(0,syn.length()-1); // chomp trailing bracket
+ synonym.add(syn);
+ }
+ organismHost.put("synonym", synonym);
+ }
+ }
+ }
+ organismHosts.put(organismHost);
+ }else if(note_term.equals("Sequence meta info")){
+ String seqMetaInfo = note.getValue();
+ if(seqMetaInfo.startsWith("SEQUENCE")){
+ seqMetaInfo = seqMetaInfo.substring(8);
+ }
+ String[] parts = seqMetaInfo.split(";");
+ if(parts.length > 1){
+ doc.put("molecular weight", Integer.parseInt(parts[1].trim().split(" ")[0]));
+ if(parts.length > 2){
+ String[] moreparts = parts[2].trim().split(" ");
+ doc.put(moreparts[1], moreparts[0]);
+ }
+ }
+ }else if(note_term.startsWith("docref")){
+ int rank = Integer.parseInt(note.getValue().split(":")[0].trim());
+ String key = note_term.substring(7); //remove the preceding "docref_"
+ if(key.contains("biojavax:")){
+ key = key.substring(9); //remove "biojavax:"
+ }
+ String value = note.getValue().substring(note.getValue().indexOf(":")+1).trim();
+ if(rankedDocRefs_addiInfo.containsKey(rank)){
+ rankedDocRefs_addiInfo.get(rank).add(key+":"+value);
+ }else{
+ List<String> tmp = new ArrayList<String>();
+ tmp.add(key+":"+value);
+ rankedDocRefs_addiInfo.put(rank,tmp);
+ }
+ }else if(note_term.equals("acc")){
+ secondaryAccs.add(note.getValue());
+ }else{
+ doc.put(note_term, note.getValue());
+ }
+ }
+ if(secondaryAccs.size() > 0){
+ doc.put("secondaryacc",secondaryAccs);
+ }
+ if(organismHosts.length() > 0){
+ doc.put("organismhost", organismHosts);
+ }
+ if(keywords.size() > 0){
+ doc.put("keywords", keywords);
+ }
+
+ //rankeddocref
+ for(RankedDocRef rankedDocRef : seq.getRankedDocRefs()){
+ JSONObject rankedDocRefObj = new JSONObject();
+ DocRef docRef = rankedDocRef.getDocumentReference();
+ rankedDocRefObj.put("rank", rankedDocRef.getRank());
+ rankedDocRefObj.put("authors", docRef.getAuthors());
+ rankedDocRefObj.put("title", docRef.getTitle());
+ rankedDocRefObj.put("location", docRef.getLocation());
+ rankedDocRefObj.put("remark", docRef.getRemark());
+ for(Map.Entry<Integer, List<String>> entry : rankedDocRefs_addiInfo.entrySet()){
+ if(entry.getKey() == rankedDocRef.getRank()){
+ for(String pair : entry.getValue()){
+ int index = pair.indexOf(":");
+ rankedDocRefObj.put(pair.substring(0, index),pair.substring(index+1));
+ }
+ }
+ }
+ rankedDocRefs.put(rankedDocRefObj);
+ }
+ if(rankedDocRefs.length() > 0){
+ doc.put("rankedDocRefs", rankedDocRefs);
+ }
+ }
+
+ public static void processEMBL_EnsemblSeq(RichSequence seq,JSONObject doc) throws ParseException {
+ logger.info("accession: " + seq.getName());
+ if(seq.getCircular()){
+ doc.put("Topology", "Circular");
+ }else{
+ doc.put("Topology", "Linear");
+ }
+ for(Note note : seq.getNoteSet()){
+ String noteName = note.getTerm().toString().substring(9); // strip the "biojavax:" prefix
+ if(noteName.equals("moltype")){
+ doc.put("Molecule type", note.getValue());
+ }else if(noteName.equals("organism")){
+ doc.put("Classfication", note.getValue().replaceAll("\n", ""));
+ }else if(noteName.equals("kw")){
+ doc.put("KeyWords", note.getValue());
+ }else if(noteName.equals("udat")){
+ doc.put("dateUpdated", formatter.format(format.parse(note.getValue())));
+ }else if(noteName.equals("cdat")){
+ doc.put("dateCreated", formatter.format(format.parse(note.getValue())));
+ }else{
+ doc.put(noteName, note.getValue());
+ }
+ }
+ doc.put("SequenceLength", seq.getInternalSymbolList().length());
+ doc.put("Description", seq.getDescription().replace('\n', ' '));
+ //System.out.println(seq.getInternalSymbolList().length());
+ //doc.put("Sequence length", seq.getInternalSymbolList().length());
+ doc.put("Accession", seq.getName());
+ doc.put("Organism",seq.getTaxon().getDisplayName());
+ doc.put("TaxonID", seq.getTaxon().getNCBITaxID());
+
+ /*for (RankedDocRef rankDocRef : seq.getRankedDocRefs()){
+ if(rankDocRef.getDocumentReference().getLocation().indexOf("Submitted") != -1){
+ int dotindex = rankDocRef.getDocumentReference().getLocation().indexOf(".");
+ String submitDate = rankDocRef.getDocumentReference().getLocation().substring(11,22);
+ String submitAddress = rankDocRef.getDocumentReference().getLocation().substring(dotindex+1).trim();
+ doc.put("SubmitDate", format.parse(submitDate));
+ doc.put("SubmittedAddress", rankDocRef.getDocumentReference().getLocation().substring(dotindex+1).trim());
+ }
+ }*/
+ //rankedDocRefs
+ //processRankedDocRefs(seq, doc);
+
+ //rankedCrossRef
+ processRankedCrossRefs(seq, doc);
+
+ //comments
+ processComment(seq, doc);
+
+ //features
+ JSONArray featureArray = new JSONArray();
+ Iterator featureIterator = seq.features();
+ while (featureIterator.hasNext()){
+ JSONObject featureObject = new JSONObject();
+ List dbxrefArray = new ArrayList();
+ RichFeature feature = (RichFeature)featureIterator.next();
+ //deal with db_xref in each feature
+ //db_xref is not required in the requirement
+ for(RankedCrossRef rankedCrossRef : feature.getRankedCrossRefs()){
+ dbxrefArray.add(rankedCrossRef.getCrossRef().getDbname() + ":" + rankedCrossRef.getCrossRef().getAccession());
+ }
+ featureObject.put("db_xref", dbxrefArray);
+
+ featureObject.put("featureType", feature.getType());
+ Map featureMap = feature.getAnnotation().asMap();
+ Iterator<SimpleComparableTerm> featureKeyIterator = featureMap.keySet().iterator();
+ while(featureKeyIterator.hasNext()){
+ SimpleComparableTerm term = featureKeyIterator.next();
+ String name = term.getName();
+ String nameValue = featureMap.get(term).toString();
+
+ if(name.equals("altitude")){
+ featureObject.put("altitude_value", Float.valueOf(nameValue.substring(0,nameValue.indexOf("m")).trim())); //number, take care of negative number
+ }else if(name.equals("collection_date")){
+ JSONArray collectionDates = new JSONArray();
+ for(String singleDate : nameValue.split("/")){
+ JSONObject collectionDate = new JSONObject();
+ if(singleDate.endsWith("FT")){
+ singleDate = singleDate.substring(0, singleDate.length()-2);
+ }
+ if(singleDate.matches("\\d{2}-\\w{3}-\\d{4}")){
+ collectionDate.put("collection_date", formatter.format(format.parse(singleDate)));
+ }else{
+ collectionDate.put("collection_date", singleDate);
+ }
+
+ collectionDate.put("collection_year", getCollectionYear(singleDate));
+ collectionDates.put(collectionDate);
+ }
+ featureObject.put("collectionDate", collectionDates);
+ }
+ featureObject.put(term.getName(),featureMap.get(term));
+ }
+ featureArray.put(featureObject);
+ }
+ doc.put("features", featureArray);
+ }
+
+ public static void processRankedCrossRefs(RichSequence seq, JSONObject doc){
+ JSONArray rankedCrossRefs = new JSONArray();
+ for(RankedCrossRef rankedCrossRef : seq.getRankedCrossRefs()){
+ JSONObject ref = new JSONObject();
+ String key = rankedCrossRef.getCrossRef().getDbname();
+ String accessions = rankedCrossRef.getCrossRef().getAccession();
+ for(Note note : rankedCrossRef.getCrossRef().getRichAnnotation().getNoteSet()){
+ accessions += ";"+note.getValue();
+ }
+ ref.put(key, accessions);
+ rankedCrossRefs.put(ref);
+ }
+ if(rankedCrossRefs.length() > 0){
+ doc.put("rankedCrossRefs", rankedCrossRefs);
+ }
+ }
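+
+ // Illustrative output shape (made-up accessions): rankedCrossRefs becomes
+ // [{"EMBL":"AB000001;AB000002"},{"PDB":"1ABC"}] -- one object per
+ // cross-reference, keyed by database name, with note values appended
+ // after semicolons.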
+
+// public static void processRankedDocRefs(RichSequence seq, JSONObject doc) throws ParseException {
+// JSONArray rankedDocRefs = new JSONArray();
+// for(RankedDocRef rankedDocRef : seq.getRankedDocRefs()){
+// DocRef docRef = rankedDocRef.getDocumentReference();
+// JSONObject rankedRef = new JSONObject();
+// rankedRef.put("authors", docRef.getAuthors());
+// rankedRef.put("title", docRef.getTitle());
+// if(docRef.getCrossref() != null){
+// String dbName = docRef.getCrossref().getDbname();
+// if(dbName.equals("PUBMED")){
+// rankedRef.put(dbName, Integer.parseInt(docRef.getCrossref().getAccession()));
+// }else{
+// rankedRef.put(dbName, docRef.getCrossref().getAccession());
+// }
+// }
+// Matcher m = submitDatep.matcher(docRef.getLocation().replaceAll("\n", " "));
+// if(m.matches()){
+// rankedRef.put("SubmitDate", formatter.format(format.parse(m.group(1))));
+// rankedRef.put("SubmitAddress", m.group(2));
+// int year = Integer.parseInt(m.group(1).substring(m.group(1).lastIndexOf("-")+1));
+// rankedRef.put("SubmitYear", year);
+// //submitCountry--extract from SubmitAddress
+// String countryName = dict.mappingCountry(m.group(2));
+// if(countryName != null){
+// rankedRef.put("SubmitCountry", countryName);
+// }
+// }
+// rankedDocRefs.put(rankedRef);
+// }
+// doc.put("rankedDocRefs", rankedDocRefs);
+// }
+
+ public static void processComment(RichSequence seq, JSONObject doc){
+ Map commentMetaData = new HashMap();
+ JSONArray comments = new JSONArray();
+ for(Comment comment: seq.getComments()){
+ JSONObject commentObj = new JSONObject();
+ if(comment.getComment().indexOf("::") != -1){
+ String comm[] = comment.getComment().split("\n");
+ for(int i = 0; i < comm.length; i++){
+ if(comm[i].matches("(.*)\\s+::\\s+(.*)")){
+ String[] metaData = comm[i].split("::");
+ String key = metaData[0].trim();
+ String value = metaData[1].trim();
+ if(key.contains(".")){
+ key = key.replaceAll("\\.", " ");
+ }
+ commentMetaData.put(key, value);
+ }
+ }
+ commentObj.put("commentMeta", commentMetaData);
+ }else{
+ commentObj.put("comment", comment.getComment());
+ }
+ comments.put(commentObj);
+ }
+ doc.put("comments", comments);
+ }
+}
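
A minimal standalone driver for the ProcessNew helpers above might look like the sketch below. It is illustrative only: it assumes an uncompressed EMBL flat file on the local filesystem (the real flow reads from HDFS) and uses the stock biojavax reader; ProcessNewDemo and the "embl" namespace label are invented for the example.

    package cn.piflow.bundle.microorganism.util;

    import java.io.BufferedReader;
    import java.io.FileReader;

    import org.biojavax.SimpleNamespace;
    import org.biojavax.bio.seq.RichSequence;
    import org.biojavax.bio.seq.RichSequenceIterator;
    import org.json.JSONObject;

    public class ProcessNewDemo {
        public static void main(String[] args) throws Exception {
            BufferedReader br = new BufferedReader(new FileReader(args[0]));
            // stock biojavax EMBL reader with an arbitrary namespace label
            RichSequenceIterator it =
                    RichSequence.IOTools.readEMBLDNA(br, new SimpleNamespace("embl"));
            while (it.hasNext()) {
                RichSequence seq = it.nextRichSequence();
                JSONObject doc = new JSONObject();
                ProcessNew.processEMBL_EnsemblSeq(seq, doc);
                System.out.println(doc.toString(2)); // one pretty-printed document per record
            }
            br.close();
        }
    }
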
diff --git a/piflow-bundle/src/test/scala/cn/piflow/bundle/ftp/emblTest.scala b/piflow-bundle/src/test/scala/cn/piflow/bundle/ftp/emblTest.scala
new file mode 100644
index 0000000..e4d1626
--- /dev/null
+++ b/piflow-bundle/src/test/scala/cn/piflow/bundle/ftp/emblTest.scala
@@ -0,0 +1,87 @@
+package cn.piflow.bundle.ftp
+
+import cn.piflow.Runner
+import cn.piflow.conf.bean.FlowBean
+import cn.piflow.conf.util.{FileUtil, OptionUtil}
+import org.apache.spark.sql.SparkSession
+import org.h2.tools.Server
+import org.jsoup.Jsoup
+import org.jsoup.select.Elements
+import org.junit.Test
+
+import scala.util.parsing.json.JSON
+
+class emblTest {
+
+ @Test
+ def testEmblDataParse(): Unit ={
+
+ //parse flow json (alternative flow configs kept commented for reference)
+ //val file = "src/main/resources/yqd/down.json"
+ //val file = "src/main/resources/yqd/refseq_genome.json"
+ //val file = "src/main/resources/yqd/select_unzip.json"
+ val file = "src/main/resources/yqd/embl_parser.json"
+
+ val flowJsonStr = FileUtil.fileReader(file)
+
+ val map = OptionUtil.getAny(JSON.parseFull(flowJsonStr)).asInstanceOf[Map[String, Any]]
+ println(map)
+
+ //create flow
+ val flowBean = FlowBean(map)
+ val flow = flowBean.constructFlow()
+
+ val h2Server = Server.createTcpServer("-tcp", "-tcpAllowOthers", "-tcpPort","50001").start()
+ //execute flow
+ val spark = SparkSession.builder()
+ .master("spark://10.0.88.70:7077")
+ .appName("Embl")
+ .config("spark.driver.memory", "8g")
+ .config("spark.executor.memory", "16g")
+ .config("spark.cores.max", "16")
+ .config("spark.jars","/root/Desktop/weishengwu/out/artifacts/piflow_bundle/piflow_bundle.jar")
+ .enableHiveSupport()
+ .getOrCreate()
+
+ val process = Runner.create()
+ .bind(classOf[SparkSession].getName, spark)
+ .bind("checkpoint.path", "hdfs://10.0.86.89:9000/xjzhu/piflow/checkpoints/")
+ .start(flow);
+
+ process.awaitTermination();
+ val pid = process.pid();
+ println("flow process pid: " + pid)
+ spark.close();
+ }
+
+
+ @Test
+ def testEmblDataParse11(): Unit ={
+
+ val url ="http://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/"
+ val doc = Jsoup.connect(url).timeout(100000000).get()
+ // fetch the listing page at url and pull out each file's name, last-modified date and size,
+ // e.g. Name Last modified Size Parent Directory -
+ // build_gbff_cu.pl 2003-04-25 17:23 21K
+
+ val elements: Elements = doc.select("html >body >table >tbody")
+ //println(elements)
+ println(elements.first().text())
+
+ // split the table text into one string per line and print each
+ val fileString = elements.first().text().split("\\n")
+
+ fileString.foreach(println)
+ }
+
+}
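
As an aside on testEmblDataParse11 above: splitting the flattened table text on newlines works, but the same name/date/size fields can be pulled row by row. A sketch in Java, assuming the usual Apache-style listing markup at the EBI site (each data row carries a link plus last-modified and size cells):

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;

    public class ListingSketch {
        public static void main(String[] args) throws Exception {
            String url = "http://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/";
            Document doc = Jsoup.connect(url).timeout(60000).get();
            for (Element row : doc.select("table tr")) {
                Element link = row.select("a").first();
                if (link == null) continue; // skip header/separator rows
                // row.text() flattens the remaining cells (date, size)
                System.out.println(link.text() + "\t" + row.text());
            }
        }
    }
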