Merge remote-tracking branch 'origin/master'
@@ -6,28 +6,41 @@
{
"uuid":"1111",
"name":"SelectFilesByName",
"bundle":"cn.piflow.bundle.ftp.SelectFilesByName",
"bundle":"cn.piflow.bundle.hdfs.SelectFilesByName",
"properties":{
"HDFSUrl":"hdfs://10.0.88.70:9000",
"HDFSPath":"/yqd/weishengwu/PDB/",
"HDFSPath":"/yqd/ftp/pdb/07/",
"selectionConditions":".*.ent.gz"
}
},{
"uuid":"2222",
"name":"UnzipFilesOnHDFS",
"bundle":"cn.piflow.bundle.http.UnzipFilesOnHDFS",
"bundle":"cn.piflow.bundle.hdfs.UnzipFilesOnHDFS",
"properties":{
"isCustomize":"false",
"hdfsUrl":"hdfs://10.0.88.70:9000",
"filePath":"/yqd/weishengwu/PDB/",
"savePath":""
"filePath":"",
"savePath":"/yqd/test/pdb/pdb1/"
}
},{
"uuid":"3333",
"name":"PDBParser",
"bundle":"cn.piflow.bundle.microorganism.PDBParser",
"bundle":"cn.piflow.bundle.microorganism.PDBData",
"properties":{
"cachePath":"/yqd/test/pdb/"
}
},
{
"uuid":"4444",
"name":"putEs",
"bundle":"cn.piflow.bundle.es.PutEs",
"properties":{
"es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72",
"es_port": "9200",
"es_index": "test_17",
"es_type": "test_17"
}

}
],
"paths":[

@@ -41,6 +54,11 @@
"outport":"",
"inport":"",
"to":"PDBParser"
},{
"from":"PDBParser",
"outport":"",
"inport":"",
"to":"putEs"
}
]
}

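Each stop above is configured entirely through its "properties" object; the Scala stops later in this diff read those values in setProperties via MapUtil.get. A minimal, hypothetical sketch of that wiring for the SelectFilesByName keys used in this flow (illustration only, not the full class in the repository):

import cn.piflow.conf.util.MapUtil

// Hypothetical, trimmed version of a stop's property wiring: the engine passes the
// JSON "properties" object as a Map and the stop pulls each value out by key.
class SelectFilesByNameSketch {
  var HDFSUrl: String = _
  var HDFSPath: String = _
  var selectionConditions: String = _

  def setProperties(map: Map[String, Any]): Unit = {
    HDFSUrl = MapUtil.get(map, key = "HDFSUrl").asInstanceOf[String]
    HDFSPath = MapUtil.get(map, key = "HDFSPath").asInstanceOf[String]
    selectionConditions = MapUtil.get(map, key = "selectionConditions").asInstanceOf[String]
  }
}
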
@ -7,30 +7,30 @@
|
|||
{
|
||||
"uuid":"0000",
|
||||
"name":"SelectFilesByName",
|
||||
"bundle":"cn.piflow.bundle.ftp.SelectFilesByName",
|
||||
"bundle":"cn.piflow.bundle.hdfs.SelectFilesByName",
|
||||
"properties":{
|
||||
"HDFSUrl":"hdfs://master2.packone:8020",
|
||||
"HDFSPath":"/microo/",
|
||||
"HDFSPath":"/yg/microo/",
|
||||
"selectionConditions":".*ample_set.xml.gz"
|
||||
}
|
||||
},{
|
||||
"uuid":"2222",
|
||||
"name":"UnzipFilesOnHDFS",
|
||||
"bundle":"cn.piflow.bundle.http.UnzipFilesOnHDFS",
|
||||
"bundle":"cn.piflow.bundle.hdfs.UnzipFilesOnHDFS",
|
||||
"properties":{
|
||||
"isCustomize":"false",
|
||||
"filePath":"",
|
||||
"hdfsUrl":"hdfs://master2.packone:8020",
|
||||
"savePath":"/microo/biosample/biosample/"
|
||||
"savePath":"/yg/microo/biosample/"
|
||||
|
||||
}
|
||||
},
|
||||
{
|
||||
"uuid":"2222",
|
||||
"name":"BioSampleParse",
|
||||
"bundle":"cn.piflow.bundle.microorganism.BioSampleParse",
|
||||
"bundle":"cn.piflow.bundle.microorganism.BioSample",
|
||||
"properties":{
|
||||
|
||||
"cachePath": "/yg/microoCache/"
|
||||
}
|
||||
},
|
||||
{
|
||||
|
@ -38,10 +38,10 @@
|
|||
"name":"putEs",
|
||||
"bundle":"cn.piflow.bundle.es.PutEs",
|
||||
"properties":{
|
||||
"es_nodes":"10.0.88.70,10.0.88.71,10.0.88.72",
|
||||
"port":"9200",
|
||||
"es_index":"sample0122",
|
||||
"es_type":"sample0122"
|
||||
"es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72",
|
||||
"es_port": "9200",
|
||||
"es_index": "test_11",
|
||||
"es_type": "test_11"
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -6,22 +6,19 @@
|
|||
{
|
||||
"uuid":"1111",
|
||||
"name":"SelectFilesByName",
|
||||
"bundle":"cn.piflow.bundle.ftp.SelectFilesByName",
|
||||
"bundle":"cn.piflow.bundle.hdfs.SelectFilesByName",
|
||||
"properties":{
|
||||
"HDFSUrl":"hdfs://master2.packone:8020",
|
||||
"HDFSPath":"/microo/biproject/",
|
||||
"HDFSPath":"/yg/microo/biproject/",
|
||||
"selectionConditions":"bioproject.xml"
|
||||
}
|
||||
},
|
||||
{
|
||||
"uuid":"2222",
|
||||
"name":"BioProjetDataParse",
|
||||
"bundle":"cn.piflow.bundle.microorganism.BioProjetDataParse",
|
||||
"bundle":"cn.piflow.bundle.microorganism.BioProjetData",
|
||||
"properties":{
|
||||
"es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72",
|
||||
"port": "9200",
|
||||
"es_index": "bioproject",
|
||||
"es_type": "bioprojecttest002"
|
||||
"cachePath": "/yg/microoCache/"
|
||||
}
|
||||
},
|
||||
{
|
||||
|
@ -30,9 +27,9 @@
|
|||
"bundle": "cn.piflow.bundle.es.PutEs",
|
||||
"properties": {
|
||||
"es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72",
|
||||
"port": "9200",
|
||||
"es_index": "bioproject10",
|
||||
"es_type": "bioproject10"
|
||||
"es_port": "9200",
|
||||
"es_index": "test_10",
|
||||
"es_type": "test_10"
|
||||
}
|
||||
}
|
||||
],
|
|
@@ -1,31 +0,0 @@
{
"flow":{
"name":"test",
"uuid":"1234",
"stops":[
{
"uuid":"1111",
"name":"LoadFromFtpToHDFS",
"bundle":"cn.piflow.bundle.ftp.LoadFromFtpToHDFS",
"properties":{
"url_str":"ftp.ebi.ac.uk",
"port":"",
"username":"",
"password":"",
"ftpFile":"/pub/databases/ena/sequence/release/con/rel_con_env_07_r138.dat.gz",
"HDFSUrl":"hdfs://10.0.88.70:9000",
"HDFSPath":"/yqd/weishengwu/embl/",
"isFile":"true"
}
}
],
"paths":[
{
"from":"",
"outport":"",
"inport":"",
"to":""
}
]
}
}

@ -7,29 +7,30 @@
|
|||
{
|
||||
"uuid":"1111",
|
||||
"name":"SelectFilesByName",
|
||||
"bundle":"cn.piflow.bundle.ftp.SelectFilesByName",
|
||||
"bundle":"cn.piflow.bundle.hdfs.SelectFilesByName",
|
||||
"properties":{
|
||||
"HDFSUrl":"hdfs://10.0.88.70:9000",
|
||||
"HDFSPath":"/yqd/weishengwu/embl",
|
||||
"selectionConditions":".*con_pro_02_r138.dat.gz,.*con_vrl_01_r138.dat.gz,.*pat_phg_01_r138.dat.gz"
|
||||
"HDFSPath":"/yqd/ftp/embl/",
|
||||
"selectionConditions":"rel_exp_con_pro_26_r138.dat.gz"
|
||||
}
|
||||
},{
|
||||
"uuid":"2222",
|
||||
"name":"UnzipFilesOnHDFS_1",
|
||||
"bundle":"cn.piflow.bundle.http.UnzipFilesOnHDFS_1",
|
||||
"name":"UnzipFilesOnHDFS",
|
||||
"bundle":"cn.piflow.bundle.hdfs.UnzipFilesOnHDFS",
|
||||
"properties":{
|
||||
"isCustomize":"false",
|
||||
"filePath":"",
|
||||
"fileType":"gz",
|
||||
"unzipPath":""
|
||||
"hdfsUrl":"hdfs://master2.packone:8020",
|
||||
"savePath":"/yqd/test/embl/"
|
||||
|
||||
}
|
||||
},
|
||||
{
|
||||
"uuid":"3333",
|
||||
"name":"EmblParser",
|
||||
"bundle":"cn.piflow.bundle.microorganism.EmblParser",
|
||||
"bundle":"cn.piflow.bundle.microorganism.EmblData",
|
||||
"properties":{
|
||||
"cachePath": "/yqd/test/embl/"
|
||||
}
|
||||
},{
|
||||
"uuid":"4444",
|
||||
|
@ -37,9 +38,9 @@
|
|||
"bundle":"cn.piflow.bundle.es.PutEs",
|
||||
"properties":{
|
||||
"es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72",
|
||||
"port": "9200",
|
||||
"es_index": "embl",
|
||||
"es_type": "embl"
|
||||
"es_port": "9200",
|
||||
"es_index": "test_22",
|
||||
"es_type": "test_22"
|
||||
}
|
||||
}
|
||||
],
|
||||
|
@ -48,10 +49,10 @@
|
|||
"from":"SelectFilesByName",
|
||||
"outport":"",
|
||||
"inport":"",
|
||||
"to":"UnzipFilesOnHDFS_1"
|
||||
"to":"UnzipFilesOnHDFS"
|
||||
},
|
||||
{
|
||||
"from":"UnzipFilesOnHDFS_1",
|
||||
"from":"UnzipFilesOnHDFS",
|
||||
"outport":"",
|
||||
"inport":"",
|
||||
"to":"EmblParser"
|
|
@ -9,15 +9,26 @@
|
|||
"bundle":"cn.piflow.bundle.hdfs.SelectFilesByName",
|
||||
"properties":{
|
||||
"HDFSUrl":"hdfs://10.0.88.70:9000",
|
||||
"HDFSPath":"/yqd/ftp/ensembl_gff3/",
|
||||
"selectionConditions":".*.gff3"
|
||||
"HDFSPath":"/yqd/ftp/ensembl_gff3/aspergillus_niger/",
|
||||
"selectionConditions":".*.gff3.gz"
|
||||
}
|
||||
},{
|
||||
"uuid":"2222",
|
||||
"name":"UnzipFilesOnHDFS",
|
||||
"bundle":"cn.piflow.bundle.hdfs.UnzipFilesOnHDFS",
|
||||
"properties":{
|
||||
"isCustomize":"false",
|
||||
"filePath":"",
|
||||
"hdfsUrl":"hdfs://10.0.88.70:9000",
|
||||
"savePath":"/yqd/test/ensembl/"
|
||||
}
|
||||
},
|
||||
{
|
||||
"uuid":"3333",
|
||||
"name":"Ensembl_gff3Parser",
|
||||
"bundle":"cn.piflow.bundle.microorganism.Ensembl_gff3Parser",
|
||||
"bundle":"cn.piflow.bundle.microorganism.Ensembl",
|
||||
"properties":{
|
||||
|
||||
"cachePath":"/yqd/test/ensembl/"
|
||||
}
|
||||
},{
|
||||
"uuid":"4444",
|
||||
|
@ -25,9 +36,9 @@
|
|||
"bundle":"cn.piflow.bundle.es.PutEs",
|
||||
"properties":{
|
||||
"es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72",
|
||||
"port": "9200",
|
||||
"es_index": "yqd_ensembl_index",
|
||||
"es_type": "yqd_ensembl_type"
|
||||
"es_port": "9200",
|
||||
"es_index": "test_21",
|
||||
"es_type": "test_21"
|
||||
}
|
||||
}
|
||||
],
|
||||
|
@ -36,6 +47,11 @@
|
|||
"from":"SelectFilesByName",
|
||||
"outport":"",
|
||||
"inport":"",
|
||||
"to":"UnzipFilesOnHDFS"
|
||||
},{
|
||||
"from":"UnzipFilesOnHDFS",
|
||||
"outport":"",
|
||||
"inport":"",
|
||||
"to":"Ensembl_gff3Parser"
|
||||
},{
|
||||
"from":"Ensembl_gff3Parser",
|
||||
|
|
|
@ -6,33 +6,30 @@
|
|||
{
|
||||
"uuid":"1111",
|
||||
"name":"SelectFilesByName",
|
||||
"bundle":"cn.piflow.bundle.ftp.SelectFilesByName",
|
||||
"bundle":"cn.piflow.bundle.hdfs.SelectFilesByName",
|
||||
"properties":{
|
||||
"HDFSUrl":"hdfs://master2.packone:8020",
|
||||
"HDFSPath":"/microo/genbank/",
|
||||
"selectionConditions":".*.seq.gz"
|
||||
"HDFSPath":"/yg/microo/genbank2/",
|
||||
"selectionConditions":"gbbct1.seq.gz"
|
||||
}
|
||||
},{
|
||||
"uuid":"2222",
|
||||
"name":"UnzipFilesOnHDFS",
|
||||
"bundle":"cn.piflow.bundle.http.UnzipFilesOnHDFS",
|
||||
"bundle":"cn.piflow.bundle.hdfs.UnzipFilesOnHDFS",
|
||||
"properties":{
|
||||
"isCustomize":"false",
|
||||
"filePath":"",
|
||||
"hdfsUrl":"hdfs://master2.packone:8020",
|
||||
"savePath":"/microo/genbank/"
|
||||
"savePath":"/yg/microo/genbank/"
|
||||
|
||||
}
|
||||
},
|
||||
{
|
||||
"uuid":"3333",
|
||||
"name":"GenBankParse",
|
||||
"bundle":"cn.piflow.bundle.microorganism.GenBankParse",
|
||||
"bundle":"cn.piflow.bundle.microorganism.GenBankData",
|
||||
"properties":{
|
||||
"es_nodes":"10.0.86.239",
|
||||
"port":"9200",
|
||||
"es_index":"genbank",
|
||||
"es_type":"data6"
|
||||
"cachePath": "/yg/microoCache/"
|
||||
}
|
||||
},
|
||||
{
|
||||
|
@ -41,9 +38,9 @@
|
|||
"bundle": "cn.piflow.bundle.es.PutEs",
|
||||
"properties": {
|
||||
"es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72",
|
||||
"port": "9200",
|
||||
"es_index": "genbank",
|
||||
"es_type": "genbank1"
|
||||
"es_port": "9200",
|
||||
"es_index": "test_12",
|
||||
"es_type": "test_12"
|
||||
}
|
||||
}
|
||||
],
|
||||
|
|
|
@ -6,29 +6,43 @@
|
|||
{
|
||||
"uuid":"1111",
|
||||
"name":"SelectFilesByName",
|
||||
"bundle":"cn.piflow.bundle.ftp.SelectFilesByName",
|
||||
"bundle":"cn.piflow.bundle.hdfs.SelectFilesByName",
|
||||
"properties":{
|
||||
"HDFSUrl":"hdfs://10.0.88.70:9000",
|
||||
"HDFSPath":"/yqd/weishengwu/gene_w/",
|
||||
"selectionConditions":".*ene_info"
|
||||
"HDFSPath":"/yqd/ftp/gene/",
|
||||
"selectionConditions":"gene_info.gz"
|
||||
}
|
||||
},{
|
||||
"uuid":"3333",
|
||||
"name":"GeneParser",
|
||||
"bundle":"cn.piflow.bundle.microorganism.GeneParser",
|
||||
"uuid":"2222",
|
||||
"name":"UnzipFilesOnHDFS",
|
||||
"bundle":"cn.piflow.bundle.hdfs.UnzipFilesOnHDFS",
|
||||
"properties":{
|
||||
"isCustomize":"false",
|
||||
"filePath":"",
|
||||
"hdfsUrl":"hdfs://10.0.88.70:9000",
|
||||
"savePath":"/yqd/test/gene/"
|
||||
}
|
||||
},
|
||||
{
|
||||
"uuid":"3333",
|
||||
"name":"Gene",
|
||||
"bundle":"cn.piflow.bundle.microorganism.Gene",
|
||||
"properties":{
|
||||
"cachePath":"/yqd/test/gene/"
|
||||
|
||||
}
|
||||
},{
|
||||
},
|
||||
{
|
||||
"uuid":"4444",
|
||||
"name":"PutEs",
|
||||
"name":"putEs",
|
||||
"bundle":"cn.piflow.bundle.es.PutEs",
|
||||
"properties":{
|
||||
"es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72",
|
||||
"port": "9200",
|
||||
"es_index": "gene_index",
|
||||
"es_type": "gene_type"
|
||||
"es_port": "9200",
|
||||
"es_index": "test_16",
|
||||
"es_type": "test_16"
|
||||
}
|
||||
|
||||
}
|
||||
],
|
||||
"paths":[
|
||||
|
@ -36,12 +50,17 @@
|
|||
"from":"SelectFilesByName",
|
||||
"outport":"",
|
||||
"inport":"",
|
||||
"to":"GeneParser"
|
||||
"to":"UnzipFilesOnHDFS"
|
||||
},{
|
||||
"from":"GeneParser",
|
||||
"from":"UnzipFilesOnHDFS",
|
||||
"outport":"",
|
||||
"inport":"",
|
||||
"to":"PutEs"
|
||||
"to":"Gene"
|
||||
},{
|
||||
"from":"Gene",
|
||||
"outport":"",
|
||||
"inport":"",
|
||||
"to":"putEs"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
|
@ -6,18 +6,19 @@
|
|||
{
|
||||
"uuid":"1111",
|
||||
"name":"SelectFilesByName",
|
||||
"bundle":"cn.piflow.bundle.ftp.SelectFilesByName",
|
||||
"bundle":"cn.piflow.bundle.hdfs.SelectFilesByName",
|
||||
"properties":{
|
||||
"HDFSUrl":"hdfs://master2.packone:8020",
|
||||
"HDFSPath":"/microoAll/",
|
||||
"HDFSPath":"/yg/microo/go/",
|
||||
"selectionConditions":"go.obo"
|
||||
}
|
||||
},
|
||||
{
|
||||
"uuid": "3333",
|
||||
"name": "GoDataParse",
|
||||
"bundle": "cn.piflow.bundle.microorganism.GoDataParse",
|
||||
"bundle": "cn.piflow.bundle.microorganism.GoData",
|
||||
"properties": {
|
||||
"cachePath": "/yg/microoCache/"
|
||||
}
|
||||
},
|
||||
{
|
||||
|
@ -26,9 +27,9 @@
|
|||
"bundle": "cn.piflow.bundle.es.PutEs",
|
||||
"properties": {
|
||||
"es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72",
|
||||
"port": "9200",
|
||||
"es_index": "go",
|
||||
"es_type": "go"
|
||||
"es_port": "9200",
|
||||
"es_index": "test_13",
|
||||
"es_type": "test_13"
|
||||
}
|
||||
}
|
||||
],
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
"bundle":"cn.piflow.bundle.http.FileDownHDFS",
|
||||
"properties":{
|
||||
"url_str":"https://gold.jgi.doe.gov/download?mode=site_excel",
|
||||
"savePath":"hdfs://master2.packone:8020/microo/golddata/gold.xlsx"
|
||||
"savePath":"hdfs://master2.packone:8020/yg/microo/golddata/gold.xlsx"
|
||||
}
|
||||
},
|
||||
{
|
||||
|
@ -17,26 +17,7 @@
|
|||
"name": "ExcelParser",
|
||||
"bundle": "cn.piflow.bundle.excel.ExcelParser",
|
||||
"properties": {
|
||||
"jaonSavePath":"hdfs://master2.packone:8020/microo/golddata/golddata.json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"uuid": "2222",
|
||||
"name": "GoldDataParse",
|
||||
"bundle": "cn.piflow.bundle.microorganism.GoldDataParse",
|
||||
"properties": {
|
||||
|
||||
}
|
||||
},
|
||||
{
|
||||
"uuid": "3333",
|
||||
"name": "putEs",
|
||||
"bundle": "cn.piflow.bundle.es.PutEs",
|
||||
"properties": {
|
||||
"es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72",
|
||||
"port": "9200",
|
||||
"es_index": "golddata1",
|
||||
"es_type": "golddatadaa"
|
||||
"cachePath":"hdfs://master2.packone:8020/yg/microo/golddata/golddata.json"
|
||||
}
|
||||
}
|
||||
],
|
||||
|
@ -46,18 +27,6 @@
|
|||
"outport":"",
|
||||
"inport":"",
|
||||
"to":"ExcelParser"
|
||||
},
|
||||
{
|
||||
"from":"ExcelParser",
|
||||
"outport":"",
|
||||
"inport":"",
|
||||
"to":"GoldDataParse"
|
||||
},
|
||||
{
|
||||
"from":"GoldDataParse",
|
||||
"outport":"",
|
||||
"inport":"",
|
||||
"to":"putEs"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
|
@ -6,29 +6,30 @@
|
|||
{
|
||||
"uuid":"1111",
|
||||
"name":"SelectFilesByName",
|
||||
"bundle":"cn.piflow.bundle.ftp.SelectFilesByName",
|
||||
"bundle":"cn.piflow.bundle.hdfs.SelectFilesByName",
|
||||
"properties":{
|
||||
"HDFSUrl":"hdfs://master2.packone:8020",
|
||||
"HDFSPath":"/microoAll/",
|
||||
"HDFSPath":"/yg/microo/interpro/",
|
||||
"selectionConditions":"interpro.xml.gz"
|
||||
}
|
||||
},{
|
||||
"uuid":"2222",
|
||||
"name":"UnzipFilesOnHDFS",
|
||||
"bundle":"cn.piflow.bundle.http.UnzipFilesOnHDFS",
|
||||
"bundle":"cn.piflow.bundle.hdfs.UnzipFilesOnHDFS",
|
||||
"properties":{
|
||||
"isCustomize":"false",
|
||||
"filePath":"",
|
||||
"hdfsUrl":"hdfs://master2.packone:8020",
|
||||
"savePath":"/microoAll/inter/"
|
||||
"savePath":"/yg/microo/interpro/"
|
||||
|
||||
}
|
||||
},
|
||||
{
|
||||
"uuid": "3333",
|
||||
"name": "InterprodataParse",
|
||||
"bundle": "cn.piflow.bundle.microorganism.InterprodataParse",
|
||||
"bundle": "cn.piflow.bundle.microorganism.InterproData",
|
||||
"properties": {
|
||||
"cachePath": "/yg/microoCache/"
|
||||
}
|
||||
},
|
||||
{
|
||||
|
@ -37,9 +38,9 @@
|
|||
"bundle": "cn.piflow.bundle.es.PutEs",
|
||||
"properties": {
|
||||
"es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72",
|
||||
"port": "9200",
|
||||
"es_index": "interpro",
|
||||
"es_type": "interpro"
|
||||
"es_port": "9200",
|
||||
"es_index": "test_14",
|
||||
"es_type": "test_14"
|
||||
}
|
||||
}
|
||||
],
|
||||
|
|
|
@ -15,9 +15,9 @@
|
|||
},{
|
||||
"uuid":"3333",
|
||||
"name":"MicrobeGenomeDataParser",
|
||||
"bundle":"cn.piflow.bundle.microorganism.MicrobeGenomeDataParser",
|
||||
"bundle":"cn.piflow.bundle.microorganism.MicrobeGenomeData",
|
||||
"properties":{
|
||||
|
||||
"cachePath":"/yqd/test/microbe/"
|
||||
}
|
||||
},{
|
||||
"uuid":"4444",
|
||||
|
@ -25,9 +25,9 @@
|
|||
"bundle":"cn.piflow.bundle.es.PutEs",
|
||||
"properties":{
|
||||
"es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72",
|
||||
"port": "9200",
|
||||
"es_index": "yqd_ncbi_microbe_genome_index_delet",
|
||||
"es_type": "yqd_ncbi_microbe_genome_type_delet"
|
||||
"es_port": "9200",
|
||||
"es_index": "test_19",
|
||||
"es_type": "test_19"
|
||||
}
|
||||
}
|
||||
],
|
|
@@ -0,0 +1,50 @@
{
"flow":{
"name":"test",
"uuid":"1234",
"stops":[
{
"uuid":"1111",
"name":"SelectFilesByName",
"bundle":"cn.piflow.bundle.hdfs.SelectFilesByName",
"properties":{
"HDFSUrl":"hdfs://10.0.88.70:9000",
"HDFSPath":"/yqd/test/pfam/",
"selectionConditions":".*full"
}
},
{
"uuid":"3333",
"name":"PfamData",
"bundle":"cn.piflow.bundle.microorganism.PfamData",
"properties":{
"cachePath":"/yqd/test/pfam/"
}
},
{
"uuid":"4444",
"name":"putEs",
"bundle":"cn.piflow.bundle.es.PutEs",
"properties":{
"es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72",
"es_port": "9200",
"es_index": "test_15",
"es_type": "test_15"
}
}
],
"paths":[
{
"from":"SelectFilesByName",
"outport":"",
"inport":"",
"to":"PfamData"
},{
"from":"PfamData",
"outport":"",
"inport":"",
"to":"putEs"
}
]
}
}

[15 binary image files were deleted in this merge; the diff viewer showed only their former width/height/size (5.2–10 KiB each).]

@ -7,29 +7,29 @@
|
|||
{
|
||||
"uuid":"1111",
|
||||
"name":"SelectFilesByName",
|
||||
"bundle":"cn.piflow.bundle.ftp.SelectFilesByName",
|
||||
"bundle":"cn.piflow.bundle.hdfs.SelectFilesByName",
|
||||
"properties":{
|
||||
"HDFSUrl":"hdfs://10.0.88.70:9000",
|
||||
"HDFSPath":"/yqd/weishengwu/refseq/",
|
||||
"selectionConditions":".*genomic.gbff.gz"
|
||||
"HDFSPath":"/yqd/ftp/refseq/",
|
||||
"selectionConditions":"archaea.4.genomic.gbff.gz"
|
||||
}
|
||||
},{
|
||||
"uuid":"2222",
|
||||
"name":"UnzipFilesOnHDFS_1",
|
||||
"bundle":"cn.piflow.bundle.http.UnzipFilesOnHDFS_1",
|
||||
"name":"UnzipFilesOnHDFS",
|
||||
"bundle":"cn.piflow.bundle.hdfs.UnzipFilesOnHDFS",
|
||||
"properties":{
|
||||
"isCustomize":"false",
|
||||
"filePath":"",
|
||||
"fileType":"gz",
|
||||
"unzipPath":""
|
||||
|
||||
"hdfsUrl":"hdfs://10.0.88.70:9000",
|
||||
"savePath":"/yqd/test/refseq/"
|
||||
}
|
||||
},
|
||||
{
|
||||
"uuid":"3333",
|
||||
"name":"Refseq_genomeParser",
|
||||
"bundle":"cn.piflow.bundle.microorganism.Refseq_genomeParser",
|
||||
"bundle":"cn.piflow.bundle.microorganism.RefseqData",
|
||||
"properties":{
|
||||
"cachePath":"/yqd/test/refseq/"
|
||||
}
|
||||
},{
|
||||
"uuid":"4444",
|
||||
|
@ -37,9 +37,9 @@
|
|||
"bundle":"cn.piflow.bundle.es.PutEs",
|
||||
"properties":{
|
||||
"es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72",
|
||||
"port": "9200",
|
||||
"es_index": "genome",
|
||||
"es_type": "archaea"
|
||||
"es_port": "9200",
|
||||
"es_index": "test_20",
|
||||
"es_type": "test_20"
|
||||
}
|
||||
}
|
||||
],
|
||||
|
@ -48,10 +48,10 @@
|
|||
"from":"SelectFilesByName",
|
||||
"outport":"",
|
||||
"inport":"",
|
||||
"to":"UnzipFilesOnHDFS_1"
|
||||
"to":"UnzipFilesOnHDFS"
|
||||
},
|
||||
{
|
||||
"from":"UnzipFilesOnHDFS_1",
|
||||
"from":"UnzipFilesOnHDFS",
|
||||
"outport":"",
|
||||
"inport":"",
|
||||
"to":"Refseq_genomeParser"
|
|
@@ -1,37 +0,0 @@
{
"flow":{
"name":"test",
"uuid":"1234",
"stops":[
{
"uuid":"0000",
"name":"SelectFilesByName",
"bundle":"cn.piflow.bundle.ftp.SelectFilesByName",
"properties":{
"HDFSUrl":"hdfs://10.0.88.70:9000",
"HDFSPath":"/yqd/",
"selectionConditions":".*genomic.gbff.gz"
}
},{
"uuid":"1111",
"name":"UnzipFilesOnHDFS_1",
"bundle":"cn.piflow.bundle.http.UnzipFilesOnHDFS_1",
"properties":{
"isCustomize":"true",
"filePath":"hdfs://10.0.88.70:9000/yqd/archaea.1.genomic.gbff.gz",
"fileType":"gz",
"unzipPath":"hdfs://10.0.88.70:9000/yqd/weishengwu/"

}
}
],
"paths":[
{
"from":"SelectFilesByName",
"outport":"",
"inport":"",
"to":"UnzipFilesOnHDFS_1"
}
]
}
}

@ -6,18 +6,29 @@
|
|||
{
|
||||
"uuid":"1111",
|
||||
"name":"SelectFilesByName",
|
||||
"bundle":"cn.piflow.bundle.ftp.SelectFilesByName",
|
||||
"bundle":"cn.piflow.bundle.hdfs.SelectFilesByName",
|
||||
"properties":{
|
||||
"HDFSUrl":"hdfs://10.0.88.70:9000",
|
||||
"HDFSPath":"/yqd/weishengwu/Swissprot_TrEMBL/",
|
||||
"selectionConditions":"uniprot_sprot.dat"
|
||||
"HDFSPath":"/yqd/ftp/swiss/",
|
||||
"selectionConditions":".*trembl.dat.gz"
|
||||
}
|
||||
},{
|
||||
"uuid":"3333",
|
||||
"name":"Swissprot_TrEMBLDataParser",
|
||||
"bundle":"cn.piflow.bundle.microorganism.Swissprot_TrEMBLDataParser",
|
||||
"uuid":"2222",
|
||||
"name":"UnzipFilesOnHDFS",
|
||||
"bundle":"cn.piflow.bundle.hdfs.UnzipFilesOnHDFS",
|
||||
"properties":{
|
||||
|
||||
"isCustomize":"false",
|
||||
"filePath":"",
|
||||
"hdfsUrl":"hdfs://10.0.88.70:9000",
|
||||
"savePath":"/yqd/test/swiss/"
|
||||
}
|
||||
},
|
||||
{
|
||||
"uuid":"3333",
|
||||
"name":"SwissprotData",
|
||||
"bundle":"cn.piflow.bundle.microorganism.SwissprotData",
|
||||
"properties":{
|
||||
"cachePath":"/yqd/test/swiss/"
|
||||
}
|
||||
},{
|
||||
"uuid":"4444",
|
||||
|
@ -25,9 +36,9 @@
|
|||
"bundle":"cn.piflow.bundle.es.PutEs",
|
||||
"properties":{
|
||||
"es_nodes": "10.0.88.70,10.0.88.71,10.0.88.72",
|
||||
"port": "9200",
|
||||
"es_index": "Swissprot_TrEMBL_index",
|
||||
"es_type": "Swissprot_TrEMBL_type"
|
||||
"es_port": "9200",
|
||||
"es_index": "test_18",
|
||||
"es_type": "test_18"
|
||||
}
|
||||
}
|
||||
],
|
||||
|
@ -36,9 +47,15 @@
|
|||
"from":"SelectFilesByName",
|
||||
"outport":"",
|
||||
"inport":"",
|
||||
"to":"Swissprot_TrEMBLDataParser"
|
||||
"to":"UnzipFilesOnHDFS"
|
||||
},
|
||||
{
|
||||
"from":"UnzipFilesOnHDFS",
|
||||
"outport":"",
|
||||
"inport":"",
|
||||
"to":"SwissprotData"
|
||||
},{
|
||||
"from":"Swissprot_TrEMBLDataParser",
|
||||
"from":"SwissprotData",
|
||||
"outport":"",
|
||||
"inport":"",
|
||||
"to":"PutEs"
|
||||
|
|
|
@@ -48,10 +48,14 @@ class FetchEs extends ConfigurableStop {

override def getPropertyDescriptor(): List[PropertyDescriptor] = {
var descriptor : List[PropertyDescriptor] = List()
val es_nodes = new PropertyDescriptor().name("es_nodes").displayName("es_nodes").defaultValue("Node of Elasticsearch").required(true)
val es_port = new PropertyDescriptor().name("es_port").displayName("es_port").defaultValue("Port of Elasticsearch").required(true)
val es_index = new PropertyDescriptor().name("es_index").displayName("es_index").defaultValue("Index of Elasticsearch").required(true)
val es_type = new PropertyDescriptor().name("es_type").displayName("es_type").defaultValue("Type of Elasticsearch").required(true)
val es_nodes = new PropertyDescriptor().name("es_nodes").displayName("es_nodes")
.description("Node of Elasticsearch").defaultValue("").required(true)
val es_port = new PropertyDescriptor().defaultValue("9200").name("es_port").displayName("es_port")
.description("Port of Elasticsearch").required(true)
val es_index = new PropertyDescriptor().name("es_index").displayName("es_index")
.description("Index of Elasticsearch").defaultValue("").required(true)
val es_type = new PropertyDescriptor().name("es_type").displayName("es_type")
.description("Type of Elasticsearch").defaultValue("").required(true)

descriptor = es_nodes :: descriptor

@@ -47,11 +47,14 @@ class PutEs extends ConfigurableStop {

override def getPropertyDescriptor(): List[PropertyDescriptor] = {
var descriptor : List[PropertyDescriptor] = List()
val es_nodes = new PropertyDescriptor().name("es_nodes").displayName("es_nodes").defaultValue("Node of Elasticsearch").required(true)
val es_port = new PropertyDescriptor().name("es_port").displayName("es_port").defaultValue("Port of Elasticsearch").required(true)
val es_index = new PropertyDescriptor().name("es_index").displayName("es_index").defaultValue("Index of Elasticsearch").required(true)
val es_type = new PropertyDescriptor().name("es_type").displayName("es_type").defaultValue("Type of Elasticsearch").required(true)

val es_nodes = new PropertyDescriptor().name("es_nodes").displayName("es_nodes")
.description("Node of Elasticsearch").defaultValue("").required(true)
val es_port = new PropertyDescriptor().defaultValue("9200").name("es_port").displayName("es_port")
.description("Port of Elasticsearch").required(true)
val es_index = new PropertyDescriptor().name("es_index").displayName("es_index")
.description("Index of Elasticsearch").defaultValue("").required(true)
val es_type = new PropertyDescriptor().name("es_type").displayName("es_type")
.description("Type of Elasticsearch").defaultValue("").required(true)

descriptor = es_nodes :: descriptor
descriptor = es_port :: descriptor

@@ -50,16 +50,22 @@ class QueryEs extends ConfigurableStop {

override def getPropertyDescriptor(): List[PropertyDescriptor] = {
var descriptor : List[PropertyDescriptor] = List()
val es_nodes = new PropertyDescriptor().name("es_nodes").displayName("es_nodes").defaultValue("Node of Elasticsearch").required(true)
val es_port = new PropertyDescriptor().name("es_port").displayName("es_port").defaultValue("Port of Elasticsearch").required(true)
val es_index = new PropertyDescriptor().name("es_index").displayName("es_index").defaultValue("Index of Elasticsearch").required(true)
val es_type = new PropertyDescriptor().name("es_type").displayName("es_type").defaultValue("Type of Elasticsearch").required(true)

val es_nodes = new PropertyDescriptor().name("es_nodes").displayName("es_nodes")
.description("Node of Elasticsearch").defaultValue("").required(true)
val es_port = new PropertyDescriptor().defaultValue("9200").name("es_port").displayName("es_port")
.description("Port of Elasticsearch").required(true)
val es_index = new PropertyDescriptor().name("es_index").displayName("es_index")
.description("Index of Elasticsearch").defaultValue("").required(true)
val es_type = new PropertyDescriptor().name("es_type").displayName("es_type")
.description("Type of Elasticsearch").defaultValue("").required(true)
val jsonDSL = new PropertyDescriptor().name("jsonDSL").displayName("jsonDSL")
.description("DSL of Elasticsearch").defaultValue("").required(true)

descriptor = es_nodes :: descriptor
descriptor = es_port :: descriptor
descriptor = es_index :: descriptor
descriptor = es_type :: descriptor
descriptor = jsonDSL :: descriptor

descriptor
}

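The three Elasticsearch stops above (FetchEs, PutEs, QueryEs) get the same refactor: example text moves out of defaultValue into description, and es_port gains a usable default of 9200. A minimal sketch of the resulting builder usage, with names copied from the diff (an illustration, not the canonical API):

import cn.piflow.conf.bean.PropertyDescriptor

// Build a descriptor list the way the refactored getPropertyDescriptor() methods do:
// human-readable hints live in description, defaultValue holds a value that actually works.
def esDescriptorsSketch(): List[PropertyDescriptor] = {
  var descriptor: List[PropertyDescriptor] = List()
  val es_nodes = new PropertyDescriptor().name("es_nodes").displayName("es_nodes")
    .description("Node of Elasticsearch").defaultValue("").required(true)
  val es_port = new PropertyDescriptor().name("es_port").displayName("es_port")
    .description("Port of Elasticsearch").defaultValue("9200").required(true)
  descriptor = es_nodes :: descriptor
  descriptor = es_port :: descriptor
  descriptor
}
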
@@ -14,39 +14,67 @@ import org.apache.hadoop.fs.Path
class DeleteHdfs extends ConfigurableStop{
override val authorEmail: String = "ygang@cmic.com"

override val inportList: List[String] = List(PortEnum.NonePort.toString)
override val outportList: List[String] = List(PortEnum.NonePort.toString)
override val inportList: List[String] = List(PortEnum.DefaultPort.toString)
override val outportList: List[String] = List(PortEnum.DefaultPort.toString)
override val description: String = "Delete file or directory on hdfs"

var hdfsUrl :String= _
var deletePath :String = _
var isCustomize:String=_
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

val spark = pec.get[SparkSession]()

val array = deletePath.split(",")
if (isCustomize.equals("false")){
val inDf = in.read()

for (i<- 0 until array.length){
val hdfsPath = hdfsUrl+array(i)
val path = new Path(array(i))
val configuration: Configuration = new Configuration()
var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String]
val pathARR: Array[String] = pathStr.split("\\/")
var hdfsUrl:String=""
for (x <- (0 until 3)){
hdfsUrl+=(pathARR(x) +"/")
}
configuration.set("fs.defaultFS",hdfsUrl)

val config = new Configuration()
config.set("fs.defaultFS",hdfsUrl)
val fs = FileSystem.get(config)
var fs: FileSystem = FileSystem.get(configuration)

fs.delete(path,true)
inDf.collect.foreach(row=>{
pathStr = row.get(0).asInstanceOf[String]
val path = new Path(pathStr)
fs.delete(path,true)
})

} else {
val array = deletePath.split(",")

for (i<- 0 until array.length){
val hdfsPath = hdfsUrl+"/"+array(i)
val path = new Path(hdfsPath)

val config = new Configuration()
config.set("fs.defaultFS",hdfsUrl)
val fs = FileSystem.get(config)

fs.delete(path,true)
}
}
}
override def setProperties(map: Map[String, Any]): Unit = {
hdfsUrl = MapUtil.get(map,key="hdfsUrl").asInstanceOf[String]
deletePath = MapUtil.get(map,key="deletePath").asInstanceOf[String]
isCustomize=MapUtil.get(map,key="isCustomize").asInstanceOf[String]
}

override def getPropertyDescriptor(): List[PropertyDescriptor] = {
var descriptor : List[PropertyDescriptor] = List()
val hdfsUrl = new PropertyDescriptor().name("hdfsUrl").displayName("hdfsUrl").defaultValue("").required(true)
val deletePath = new PropertyDescriptor().name("deletePath").displayName("deletePath").defaultValue("").required(true)
val isCustomize = new PropertyDescriptor().name("isCustomize").displayName("isCustomize").description("Whether to customize the compressed file path, if true, " +
"you must specify the path where the compressed file is located . " +
"If it is false, it will automatically find the file path data from the upstream port ")
.defaultValue("false").allowableValues(Set("true","false")).required(true)
descriptor = isCustomize :: descriptor
descriptor = hdfsUrl :: descriptor
descriptor = deletePath :: descriptor
descriptor

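In the new non-customized branch, DeleteHdfs derives the fs.defaultFS URL from the first three segments of an incoming path and then deletes each row's path recursively. A condensed, self-contained sketch of that logic using only the Hadoop FileSystem API already imported in this file (a hypothetical helper, not the class itself):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// Derive "hdfs://host:port/" from a full path such as
// "hdfs://10.0.88.70:9000/yqd/test/pdb/part-00000", then delete that path recursively.
def deleteRecursivelySketch(fullPath: String): Unit = {
  val parts = fullPath.split("/")                  // Array("hdfs:", "", "host:port", "yqd", ...)
  val hdfsUrl = parts.take(3).mkString("/") + "/"  // "hdfs://host:port/"
  val conf = new Configuration()
  conf.set("fs.defaultFS", hdfsUrl)
  val fs = FileSystem.get(conf)
  fs.delete(new Path(fullPath), true)              // true = delete directories recursively
}
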
@@ -10,7 +10,7 @@ import org.apache.spark.sql.SparkSession
class GetHdfs extends ConfigurableStop{
override val authorEmail: String = "ygang@cmic.com"
override val description: String = "Get data from hdfs"
override val inportList: List[String] = List(PortEnum.NonePort.toString)
override val inportList: List[String] = List(PortEnum.DefaultPort.toString)
override val outportList: List[String] = List(PortEnum.DefaultPort.toString)

var hdfsUrl : String=_

@@ -45,7 +45,7 @@ class GetHdfs extends ConfigurableStop{
}
else {
val rdd = sc.textFile(path)
val outDf = rdd.toDF("txt")
val outDf = rdd.toDF()
outDf.schema.printTreeString()
//outDf.show()
out.write(outDf)

@@ -60,9 +60,13 @@ class GetHdfs extends ConfigurableStop{

override def getPropertyDescriptor(): List[PropertyDescriptor] = {
var descriptor : List[PropertyDescriptor] = List()
val hdfsPath = new PropertyDescriptor().name("hdfsPath").displayName("hdfsPath").defaultValue("").required(true)
val hdfsUrl = new PropertyDescriptor().name("hdfsUrl").displayName("hdfsUrl").defaultValue("").required(true)
val types = new PropertyDescriptor().name("types").displayName("types").defaultValue("txt,parquet,csv,json").required(true)
val hdfsPath = new PropertyDescriptor().name("hdfsPath").displayName("hdfsPath")
.defaultValue("").required(true)
val hdfsUrl = new PropertyDescriptor().name("hdfsUrl").displayName("hdfsUrl")
.defaultValue("").required(true)
val types = new PropertyDescriptor().name("types").displayName("types").description("txt,parquet,csv,json")
.defaultValue("txt").allowableValues(Set("txt","parquet","csv","json")).required(true)

descriptor = types :: descriptor
descriptor = hdfsPath :: descriptor
descriptor = hdfsUrl :: descriptor

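For the plain-text branch, the hunk above drops the explicit column name: rdd.toDF("txt") becomes rdd.toDF(), so the single column now uses Spark's default name "value". A minimal spark-shell style sketch of that read path under an assumed session; the HDFS path is illustrative, taken from the flows earlier in this diff:

import org.apache.spark.sql.SparkSession

// Read a text file from HDFS into a one-column DataFrame, mirroring the "txt" branch of GetHdfs.
val spark = SparkSession.builder().appName("GetHdfsSketch").master("local[*]").getOrCreate()
import spark.implicits._

val path = "hdfs://10.0.88.70:9000/yqd/test/gene/gene_info"  // illustrative HDFS path
val rdd = spark.sparkContext.textFile(path)
val outDf = rdd.toDF()          // default column name is "value"
outDf.schema.printTreeString()
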
@ -6,38 +6,54 @@ import cn.piflow.conf.util.{ImageUtil, MapUtil}
|
|||
import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroup}
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
|
||||
import org.apache.spark.sql. SparkSession
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.types.{StringType, StructField, StructType}
|
||||
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
|
||||
|
||||
import scala.collection.mutable.ArrayBuffer
|
||||
|
||||
|
||||
|
||||
class ListHdfs extends ConfigurableStop{
|
||||
override val authorEmail: String = "ygang@cmic.com"
|
||||
|
||||
override val inportList: List[String] = List(PortEnum.NonePort.toString)
|
||||
override val inportList: List[String] = List(PortEnum.DefaultPort.toString)
|
||||
override val outportList: List[String] = List(PortEnum.DefaultPort.toString)
|
||||
override val description: String = "Retrieve a list of files from hdfs"
|
||||
|
||||
var hdfsPath :String= _
|
||||
var hdfsUrl :String= _
|
||||
var list = List("")
|
||||
var HDFSPath :String= _
|
||||
var HDFSUrl :String= _
|
||||
var pathARR:ArrayBuffer[String]=ArrayBuffer()
|
||||
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
|
||||
val spark = pec.get[SparkSession]()
|
||||
val sc = spark.sparkContext
|
||||
|
||||
val path = new Path(hdfsPath)
|
||||
|
||||
val path = new Path(HDFSPath)
|
||||
iterationFile(path.toString)
|
||||
|
||||
import spark.implicits._
|
||||
|
||||
val outDF = sc.parallelize(list).toDF()
|
||||
val rows: List[Row] = pathARR.map(each => {
|
||||
var arr:Array[String]=Array(each)
|
||||
val row: Row = Row.fromSeq(arr)
|
||||
row
|
||||
}).toList
|
||||
|
||||
val rowRDD: RDD[Row] = sc.makeRDD(rows)
|
||||
// val fields: Array[StructField] = "path".split("/").map(d=>StructField(d,StringType,nullable = true))
|
||||
// val schema: StructType = StructType(fields)
|
||||
|
||||
val schema: StructType = StructType(Array(
|
||||
StructField("path",StringType)
|
||||
))
|
||||
val outDF: DataFrame = spark.createDataFrame(rowRDD,schema)
|
||||
out.write(outDF)
|
||||
}
|
||||
|
||||
// recursively traverse the folder
|
||||
def iterationFile(path: String):Unit = {
|
||||
val config = new Configuration()
|
||||
config.set("fs.defaultFS",hdfsUrl)
|
||||
config.set("fs.defaultFS",HDFSUrl)
|
||||
val fs = FileSystem.get(config)
|
||||
val listf = new Path(path)
|
||||
|
||||
|
@ -45,28 +61,25 @@ class ListHdfs extends ConfigurableStop{
|
|||
|
||||
for (f <- statuses) {
|
||||
val fsPath = f.getPath().toString
|
||||
//println(fsPath)
|
||||
|
||||
if (f.isDirectory) {
|
||||
list = fsPath::list
|
||||
// pathARR += fsPath
|
||||
iterationFile(fsPath)
|
||||
|
||||
} else{
|
||||
list = f.getPath.toString::list
|
||||
pathARR += f.getPath.toString
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
override def setProperties(map: Map[String, Any]): Unit = {
|
||||
hdfsUrl = MapUtil.get(map,key="hdfsUrl").asInstanceOf[String]
|
||||
hdfsPath = MapUtil.get(map,key="hdfsPath").asInstanceOf[String]
|
||||
HDFSUrl = MapUtil.get(map,key="HDFSUrl").asInstanceOf[String]
|
||||
HDFSPath = MapUtil.get(map,key="HDFSPath").asInstanceOf[String]
|
||||
}
|
||||
|
||||
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
|
||||
var descriptor : List[PropertyDescriptor] = List()
|
||||
val hdfsPath = new PropertyDescriptor().name("hdfsPath").displayName("hdfsPath").defaultValue("").required(true)
|
||||
val hdfsUrl = new PropertyDescriptor().name("hdfsUrl").displayName("hdfsUrl").defaultValue("").required(true)
|
||||
val hdfsPath = new PropertyDescriptor().name("HDFSPath").displayName("HDFSPath").defaultValue("").required(true)
|
||||
val hdfsUrl = new PropertyDescriptor().name("HDFSUrl").displayName("HDFSUrl").defaultValue("").required(true)
|
||||
descriptor = hdfsPath :: descriptor
|
||||
descriptor = hdfsUrl :: descriptor
|
||||
descriptor
|
||||
|
|
|
@@ -13,7 +13,7 @@ import org.apache.spark.sql.SparkSession
class PutHdfs extends ConfigurableStop{
override val authorEmail: String = "ygang@cmic.com"
override val inportList: List[String] = List(PortEnum.DefaultPort.toString)
override val outportList: List[String] = List(PortEnum.NonePort.toString)
override val outportList: List[String] = List(PortEnum.DefaultPort.toString)
override val description: String = "Put data to hdfs"

var hdfsPath :String= _

@ -17,7 +17,7 @@ import scala.collection.mutable.ArrayBuffer
|
|||
class SelectFilesByName extends ConfigurableStop{
|
||||
override val authorEmail: String = "yangqidong@cnic.cn"
|
||||
override val description: String = "Select files by file name"
|
||||
override val inportList: List[String] = List(PortEnum.NonePort.toString)
|
||||
override val inportList: List[String] = List(PortEnum.DefaultPort.toString)
|
||||
override val outportList: List[String] = List(PortEnum.DefaultPort.toString)
|
||||
|
||||
var HDFSUrl:String=_
|
||||
|
@ -68,10 +68,11 @@ class SelectFilesByName extends ConfigurableStop{
|
|||
val schema: StructType = StructType(fields)
|
||||
val df: DataFrame = session.createDataFrame(rowRDD,schema)
|
||||
|
||||
|
||||
println("#################################################")
|
||||
df.show(20)
|
||||
println("#################################################")
|
||||
println(df.count+"#################################################")
|
||||
|
||||
|
||||
|
||||
out.write(df)
|
||||
}
|
||||
|
@ -85,9 +86,12 @@ class SelectFilesByName extends ConfigurableStop{
|
|||
|
||||
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
|
||||
var descriptor : List[PropertyDescriptor] = List()
|
||||
val HDFSUrl = new PropertyDescriptor().name("HDFSUrl").displayName("HDFSUrl").defaultValue("The URL of the HDFS file system, such as hdfs://10.0.88.70:9000").required(true)
|
||||
val HDFSPath = new PropertyDescriptor().name("HDFSPath").displayName("HDFSPath").defaultValue("The save path of the HDFS file system, such as /test/Ab").required(true)
|
||||
val selectionConditions = new PropertyDescriptor().name("selectionConditions").displayName("selectionConditions").defaultValue("To select conditions, you need to fill in regular expressions in java, such as. * abc. *").required(true)
|
||||
val HDFSUrl = new PropertyDescriptor().name("HDFSUrl").displayName("HDFSUrl").description("The URL of the HDFS file system, such as hdfs://ip:port")
|
||||
.defaultValue("hdfs://").required(true)
|
||||
val HDFSPath = new PropertyDescriptor().name("HDFSPath").displayName("HDFSPath").description("The save path of the HDFS file system, such as /test/Ab")
|
||||
.defaultValue("").required(true)
|
||||
val selectionConditions = new PropertyDescriptor().name("selectionConditions").displayName("selectionConditions").description("To select conditions, you need to fill in regular expressions in java, such as. * abc. *")
|
||||
.defaultValue("").required(true)
|
||||
descriptor = HDFSUrl :: descriptor
|
||||
descriptor = HDFSPath :: descriptor
|
||||
descriptor = selectionConditions :: descriptor
|
||||
|
|
|
@ -186,7 +186,7 @@ class UnzipFilesOnHDFS extends ConfigurableStop {
|
|||
val isCustomize = new PropertyDescriptor().name("isCustomize").displayName("isCustomize").description("Whether to customize the compressed file path, if true, " +
|
||||
"you must specify the path where the compressed file is located . " +
|
||||
"If it is false, it will automatically find the file path data from the upstream port ")
|
||||
.defaultValue("").required(false)
|
||||
.defaultValue("false").allowableValues(Set("true","false")).required(true)
|
||||
descriptor = isCustomize :: descriptor
|
||||
descriptor = filePath :: descriptor
|
||||
descriptor = hdfsUrl :: descriptor
|
||||
|
|
|
@ -41,17 +41,15 @@ class BioProjetData extends ConfigurableStop{
|
|||
configuration.set("fs.defaultFS",hdfsUrl)
|
||||
var fs: FileSystem = FileSystem.get(configuration)
|
||||
|
||||
val hdfsPathTemporary = hdfsUrl+cachePath+"/bioprojectCatch/bioprojectCatch.json"
|
||||
val hdfsPathTemporary = hdfsUrl+cachePath+"/bioprojectCache/bioprojectCache.json"
|
||||
|
||||
val path: Path = new Path(hdfsPathTemporary)
|
||||
if(fs.exists(path)){
|
||||
fs.delete(path)
|
||||
}
|
||||
fs.create(path).close()
|
||||
var fdosOut: FSDataOutputStream = fs.append(path)
|
||||
var jsonStr: String =""
|
||||
var bisIn: BufferedInputStream =null
|
||||
|
||||
val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))
|
||||
|
||||
inDf.collect().foreach(row =>{
|
||||
val pathStr = row.get(0).asInstanceOf[String]
|
||||
|
@ -64,23 +62,20 @@ class BioProjetData extends ConfigurableStop{
|
|||
br.readLine()
|
||||
i = i + 1
|
||||
}
|
||||
var count = 0
|
||||
|
||||
var xml = new StringBuffer()
|
||||
var x = 0
|
||||
var count = 0
|
||||
while ((line = br.readLine()) != null && x <1 && line!= null ) {
|
||||
|
||||
xml.append(line)
|
||||
if (line.equals("</PackageSet>")){
|
||||
println("----------------------------------break")
|
||||
println("break")
|
||||
x == 1
|
||||
} else if (line.indexOf("</" + name + ">") != -1){ //reach the end of a doc
|
||||
count = count + 1
|
||||
val doc: JSONObject = XML.toJSONObject(xml.toString()).getJSONObject(name)
|
||||
|
||||
val accession = doc.getJSONObject("Project").getJSONObject("Project")
|
||||
.getJSONObject("ProjectID")
|
||||
.getJSONObject("ArchiveID")
|
||||
.getString("accession")
|
||||
|
||||
val projectDescr = doc.getJSONObject("Project").getJSONObject("Project")
|
||||
.getJSONObject("ProjectDescr")
|
||||
|
||||
|
@ -115,7 +110,7 @@ class BioProjetData extends ConfigurableStop{
|
|||
}
|
||||
|
||||
// ----------------3
|
||||
if(doc.getJSONObject("Project").optJSONObject("Submission") != null){
|
||||
if(doc.getJSONObject("Project").optJSONObject("Submission") != null){
|
||||
val submission = doc.getJSONObject("Project").optJSONObject("Submission")
|
||||
|
||||
if(submission.opt("submitted") != null){
|
||||
|
@ -166,27 +161,19 @@ class BioProjetData extends ConfigurableStop{
|
|||
}
|
||||
}
|
||||
|
||||
bisIn = new BufferedInputStream(new ByteArrayInputStream((doc.toString+"\n").getBytes()))
|
||||
|
||||
val buff: Array[Byte] = new Array[Byte](1048576)
|
||||
|
||||
var num: Int = bisIn.read(buff)
|
||||
while (num != -1) {
|
||||
fdosOut.write(buff, 0, num)
|
||||
fdosOut.flush()
|
||||
num = bisIn.read(buff)
|
||||
}
|
||||
fdosOut.flush()
|
||||
bisIn = null
|
||||
doc.write(hdfsWriter)
|
||||
hdfsWriter.write("\n")
|
||||
|
||||
xml = new StringBuffer()
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
br.close()
|
||||
fdis.close()
|
||||
})
|
||||
|
||||
fdosOut.close()
|
||||
|
||||
hdfsWriter.close()
|
||||
println("start parser HDFSjsonFile")
|
||||
val df: DataFrame = spark.read.json(hdfsPathTemporary)
|
||||
out.write(df)
|
||||
|
@ -199,7 +186,8 @@ class BioProjetData extends ConfigurableStop{
|
|||
|
||||
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
|
||||
var descriptor : List[PropertyDescriptor] = List()
|
||||
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").defaultValue("").required(true)
|
||||
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
|
||||
.defaultValue("/bioproject").required(true)
|
||||
descriptor = cachePath :: descriptor
|
||||
descriptor
|
||||
}
|
||||
|
|
|
@ -25,7 +25,6 @@ class BioSample extends ConfigurableStop{
|
|||
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
|
||||
val spark = pec.get[SparkSession]()
|
||||
val sc = spark.sparkContext
|
||||
// val ssc = spark.sqlContext
|
||||
val inDf= in.read()
|
||||
|
||||
val configuration: Configuration = new Configuration()
|
||||
|
@ -35,27 +34,22 @@ class BioSample extends ConfigurableStop{
|
|||
for (x <- (0 until 3)){
|
||||
hdfsUrl+=(pathARR(x) +"/")
|
||||
}
|
||||
|
||||
configuration.set("fs.defaultFS",hdfsUrl)
|
||||
var fs: FileSystem = FileSystem.get(configuration)
|
||||
|
||||
var hdfsPathJsonCache:String = ""
|
||||
|
||||
hdfsPathJsonCache = hdfsUrl+cachePath+"/biosampleCache/biosampleCache.json"
|
||||
val hdfsPathTemporary = hdfsUrl+cachePath+"/biosampleCache/biosampleCache.json"
|
||||
|
||||
val path: Path = new Path(hdfsPathJsonCache)
|
||||
val path: Path = new Path(hdfsPathTemporary)
|
||||
if(fs.exists(path)){
|
||||
fs.delete(path)
|
||||
}
|
||||
fs.create(path).close()
|
||||
|
||||
val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))
|
||||
|
||||
var fdosOut: FSDataOutputStream = null
|
||||
var bisIn: BufferedInputStream =null
|
||||
fdosOut = fs.append(path)
|
||||
|
||||
var count = 0
|
||||
var nameNum = 0
|
||||
inDf.collect().foreach(row => {
|
||||
pathStr = row.get(0).asInstanceOf[String]
|
||||
|
||||
|
@ -66,8 +60,7 @@ class BioSample extends ConfigurableStop{
|
|||
br.readLine()
|
||||
br.readLine()
|
||||
|
||||
|
||||
while ((line = br.readLine()) != null && line!= null) {
|
||||
while ((line = br.readLine()) != null && line!= null ) {
|
||||
xml = xml + line
|
||||
if (line.indexOf("</" + docName + ">") != -1) {
|
||||
count = count + 1
|
||||
|
@ -79,7 +72,6 @@ class BioSample extends ConfigurableStop{
|
|||
if (attrs.equals("")) {
|
||||
doc.remove("Attributes")
|
||||
}
|
||||
|
||||
// Links
|
||||
val links: String = doc.optString("Links")
|
||||
if (links != null) {
|
||||
|
@ -123,36 +115,24 @@ class BioSample extends ConfigurableStop{
|
|||
if (jsonDoc.contains("Ids\":{\"Id\":[{")){
|
||||
|
||||
} else {
|
||||
// println(count)
|
||||
// println(jsonDoc)
|
||||
jsonDoc = jsonDoc.replace("Ids\":{\"Id\":{","Ids\":{\"Id\":[{")
|
||||
}
|
||||
}
|
||||
|
||||
bisIn = new BufferedInputStream(new ByteArrayInputStream((jsonDoc+"\n").getBytes()))
|
||||
|
||||
val buff: Array[Byte] = new Array[Byte](1048576)
|
||||
|
||||
var num: Int = bisIn.read(buff)
|
||||
while (num != -1) {
|
||||
fdosOut.write(buff, 0, num)
|
||||
fdosOut.flush()
|
||||
num = bisIn.read(buff)
|
||||
}
|
||||
fdosOut.flush()
|
||||
bisIn = null
|
||||
doc.write(hdfsWriter)
|
||||
hdfsWriter.write("\n")
|
||||
|
||||
xml = ""
|
||||
}
|
||||
}
|
||||
br.close()
|
||||
fdis.close()
|
||||
})
|
||||
hdfsWriter.close()
|
||||
|
||||
// bisIn.close()
|
||||
fdosOut.close()
|
||||
println("start parser HDFSjsonFile")
|
||||
|
||||
val df: DataFrame = spark.read.json(hdfsPathJsonCache)
|
||||
|
||||
val df: DataFrame = spark.read.json(hdfsPathTemporary)
|
||||
out.write(df)
|
||||
}
|
||||
|
||||
|
@ -164,7 +144,8 @@ class BioSample extends ConfigurableStop{
|
|||
|
||||
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
|
||||
var descriptor : List[PropertyDescriptor] = List()
|
||||
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").defaultValue("").required(true)
|
||||
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
|
||||
.defaultValue("/biosample").required(true)
|
||||
descriptor = cachePath :: descriptor
|
||||
descriptor
|
||||
}
|
||||
|
|
|
@ -4,7 +4,7 @@ import java.io._
|
|||
|
||||
import cn.piflow.bundle.microorganism.util.{CustomIOTools, Process}
|
||||
import cn.piflow.conf.bean.PropertyDescriptor
|
||||
import cn.piflow.conf.util.ImageUtil
|
||||
import cn.piflow.conf.util.{ImageUtil, MapUtil}
|
||||
import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroup}
|
||||
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
|
@ -20,13 +20,12 @@ class EmblData extends ConfigurableStop{
|
|||
override val outportList: List[String] = List(PortEnum.DefaultPort.toString)
|
||||
|
||||
|
||||
|
||||
var cachePath:String = _
|
||||
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
|
||||
val session = pec.get[SparkSession]()
|
||||
|
||||
val inDf: DataFrame = in.read()
|
||||
val configuration: Configuration = new Configuration()
|
||||
|
||||
val configuration: Configuration = new Configuration()
|
||||
var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String]
|
||||
val pathARR: Array[String] = pathStr.split("\\/")
|
||||
var hdfsUrl:String=""
|
||||
|
@ -36,85 +35,56 @@ class EmblData extends ConfigurableStop{
|
|||
configuration.set("fs.defaultFS",hdfsUrl)
|
||||
var fs: FileSystem = FileSystem.get(configuration)
|
||||
|
||||
val hdfsPathTemporary:String = hdfsUrl+"/Refseq_genomeParser_temporary.json"
|
||||
val path: Path = new Path(hdfsPathTemporary)
|
||||
val hdfsPathTemporary = hdfsUrl+cachePath+"/emblDataCache/emblDataCache.json"
|
||||
|
||||
val path: Path = new Path(hdfsPathTemporary)
|
||||
if(fs.exists(path)){
|
||||
fs.delete(path)
|
||||
}
|
||||
|
||||
fs.create(path).close()
|
||||
var fdos: FSDataOutputStream = fs.append(path)
|
||||
|
||||
var jsonStr: String =""
|
||||
val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))
|
||||
|
||||
var bis: BufferedInputStream =null
|
||||
var doc: JSONObject = null
|
||||
var seq: RichSequence = null
|
||||
var br: BufferedReader = null
|
||||
var sequences: RichSequenceIterator = null
|
||||
var count : Int =0
|
||||
|
||||
inDf.collect().foreach(row => {
|
||||
|
||||
var n : Int =0
|
||||
pathStr = row.get(0).asInstanceOf[String]
|
||||
pathStr = row.get(0).asInstanceOf[String]
|
||||
var fdis: FSDataInputStream = fs.open(new Path(pathStr))
|
||||
br = new BufferedReader(new InputStreamReader(fdis))
|
||||
|
||||
var fdis: FSDataInputStream = fs.open(new Path(pathStr))
|
||||
sequences = CustomIOTools.IOTools.readEMBLDNA(br, null)
|
||||
while (sequences.hasNext) {
|
||||
count += 1
|
||||
seq = sequences.nextRichSequence()
|
||||
doc = new JSONObject
|
||||
Process.processEMBL_EnsemblSeq(seq, doc)
|
||||
|
||||
var br: BufferedReader = new BufferedReader(new InputStreamReader(fdis))
|
||||
|
||||
var sequences: RichSequenceIterator = CustomIOTools.IOTools.readEMBLDNA (br, null)
|
||||
|
||||
while (sequences.hasNext) {
|
||||
n += 1
|
||||
var seq: RichSequence = sequences.nextRichSequence()
|
||||
var doc: JSONObject = new JSONObject
|
||||
Process.processEMBL_EnsemblSeq(seq, doc)
|
||||
jsonStr = doc.toString
|
||||
println("start " + n)
|
||||
|
||||
if (n == 1) {
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("[" + jsonStr).getBytes()))
|
||||
} else {
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("," + jsonStr).getBytes()))
|
||||
doc.write(hdfsWriter)
|
||||
hdfsWriter.write("\n")
|
||||
}
|
||||
br.close()
|
||||
fdis.close()
|
||||
|
||||
val buff: Array[Byte] = new Array[Byte](1048576)
|
||||
|
||||
var count: Int = bis.read(buff)
|
||||
while (count != -1) {
|
||||
fdos.write(buff, 0, count)
|
||||
fdos.flush()
|
||||
count = bis.read(buff)
|
||||
}
|
||||
|
||||
fdos.flush()
|
||||
bis = null
|
||||
seq = null
|
||||
doc = null
|
||||
}
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("]").getBytes()))
|
||||
val buff: Array[Byte] = new Array[Byte](1048576)
|
||||
|
||||
var count: Int = bis.read(buff)
|
||||
while (count != -1) {
|
||||
fdos.write(buff, 0, count)
|
||||
fdos.flush()
|
||||
count = bis.read(buff)
|
||||
}
|
||||
fdos.flush()
|
||||
})
|
||||
|
||||
fdos.close()
|
||||
|
||||
val df: DataFrame = session.read.json(hdfsPathTemporary)
|
||||
|
||||
out.write(df)
|
||||
|
||||
}
|
||||
|
||||
override def setProperties(map: Map[String, Any]): Unit = {
|
||||
|
||||
def setProperties(map: Map[String, Any]): Unit = {
|
||||
cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String]
|
||||
}
|
||||
|
||||
override def getPropertyDescriptor(): List[PropertyDescriptor] ={
|
||||
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
|
||||
var descriptor : List[PropertyDescriptor] = List()
|
||||
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
|
||||
.defaultValue("/emblData").required(true)
|
||||
descriptor = cachePath :: descriptor
|
||||
descriptor
|
||||
}
|
||||
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
package cn.piflow.bundle.microorganism
|
||||
|
||||
import java.io.{BufferedInputStream, BufferedReader, ByteArrayInputStream, InputStreamReader}
|
||||
import java.io._
|
||||
|
||||
import cn.piflow.bundle.microorganism.util.ParserGff3Data
|
||||
import cn.piflow.conf.bean.PropertyDescriptor
|
||||
import cn.piflow.conf.util.ImageUtil
|
||||
import cn.piflow.conf.util.{ImageUtil, MapUtil}
|
||||
import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroup}
|
||||
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
|
@ -19,12 +19,15 @@ class Ensembl extends ConfigurableStop{
|
|||
override val inportList: List[String] =List(PortEnum.DefaultPort.toString)
|
||||
override val outportList: List[String] = List(PortEnum.DefaultPort.toString)
|
||||
|
||||
override def setProperties(map: Map[String, Any]): Unit = {
|
||||
|
||||
var cachePath:String = _
|
||||
def setProperties(map: Map[String, Any]): Unit = {
|
||||
cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String]
|
||||
}
|
||||
|
||||
override def getPropertyDescriptor(): List[PropertyDescriptor] ={
|
||||
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
|
||||
var descriptor : List[PropertyDescriptor] = List()
|
||||
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
|
||||
.defaultValue("/ensembl").required(true)
|
||||
descriptor = cachePath :: descriptor
|
||||
descriptor
|
||||
}
|
||||
|
||||
|
@ -43,82 +46,55 @@ class Ensembl extends ConfigurableStop{
|
|||
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
|
||||
|
||||
val session = pec.get[SparkSession]()
|
||||
|
||||
val inDf: DataFrame = in.read()
|
||||
|
||||
val configuration: Configuration = new Configuration()
|
||||
var pathStr: String = ""
|
||||
var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String]
|
||||
val pathARR: Array[String] = pathStr.split("\\/")
|
||||
var hdfsUrl:String=""
|
||||
try{
|
||||
pathStr =inDf.take(1)(0).get(0).asInstanceOf[String]
|
||||
val pathARR: Array[String] = pathStr.split("\\/")
|
||||
|
||||
for (x <- (0 until 3)){
|
||||
hdfsUrl+=(pathARR(x) +"/")
|
||||
}
|
||||
}catch {
|
||||
case e:Exception => throw new Exception("Path error")
|
||||
for (x <- (0 until 3)){
|
||||
hdfsUrl+=(pathARR(x) +"/")
|
||||
}
|
||||
|
||||
configuration.set("fs.defaultFS",hdfsUrl)
|
||||
var fs: FileSystem = FileSystem.get(configuration)
|
||||
|
||||
val hdfsPathTemporary:String = hdfsUrl+"/ensembl_genomeParser_temporary.json"
|
||||
val path: Path = new Path(hdfsPathTemporary)
|
||||
val hdfsPathTemporary = hdfsUrl+cachePath+"/ensemblCache/ensemblCache.json"
|
||||
|
||||
val path: Path = new Path(hdfsPathTemporary)
|
||||
if(fs.exists(path)){
|
||||
fs.delete(path)
|
||||
}
|
||||
|
||||
fs.create(path).close()
|
||||
var fdos: FSDataOutputStream = fs.append(path)
|
||||
val buff: Array[Byte] = new Array[Byte](1048576)
|
||||
|
||||
val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))
|
||||
|
||||
val parser: ParserGff3Data = new ParserGff3Data
|
||||
|
||||
var bis: BufferedInputStream =null
|
||||
var fdis: FSDataInputStream =null
|
||||
var br: BufferedReader = null
|
||||
var sequences: RichSequenceIterator = null
|
||||
var doc: JSONObject = null
|
||||
var seq: RichSequence = null
|
||||
var jsonStr: String = ""
|
||||
var count:Int = 0
|
||||
inDf.collect().foreach(row => {
|
||||
pathStr = row.get(0).asInstanceOf[String]
|
||||
println("start parser ^^^" + pathStr)
|
||||
|
||||
fdis = fs.open(new Path(pathStr))
|
||||
br = new BufferedReader(new InputStreamReader(fdis))
|
||||
var eachStr:String=null
|
||||
|
||||
while((eachStr = br.readLine()) != null && eachStr != null ){
|
||||
doc = parser.parserGff3(eachStr)
|
||||
jsonStr = doc.toString
|
||||
if(jsonStr.length > 2){
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream((jsonStr+"\n").getBytes()))
|
||||
var count: Int = bis.read(buff)
|
||||
while (count != -1) {
|
||||
fdos.write(buff, 0, count)
|
||||
fdos.flush()
|
||||
count = bis.read(buff)
|
||||
}
|
||||
fdos.flush()
|
||||
|
||||
bis.close()
|
||||
bis = null
|
||||
doc = null
|
||||
seq = null
|
||||
jsonStr = ""
|
||||
if(doc.toString.length > 2){
|
||||
count += 1
|
||||
doc.write(hdfsWriter)
|
||||
hdfsWriter.write("\n")
|
||||
}
|
||||
|
||||
}
|
||||
sequences = null
|
||||
|
||||
br.close()
|
||||
br = null
|
||||
fdis.close()
|
||||
fdis =null
|
||||
pathStr = null
|
||||
})
|
||||
|
||||
fdos.close()
|
||||
hdfsWriter.close()
|
||||
|
||||
out.write(session.read.json(hdfsPathTemporary))
|
||||
}
|
||||
|
|
|
@ -2,15 +2,14 @@ package cn.piflow.bundle.microorganism

import java.io._

import cn.piflow.bundle.microorganism.util.{CustomIOTools, Process}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.ImageUtil
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.json.JSONObject

@ -21,10 +20,10 @@ class GenBankData extends ConfigurableStop{
val outportList: List[String] = List(PortEnum.DefaultPort.toString)

var cachePath:String = _
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
val sc = spark.sparkContext

val inDf= in.read()

val configuration: Configuration = new Configuration()

@ -34,21 +33,18 @@ class GenBankData extends ConfigurableStop{
for (x <- (0 until 3)){
hdfsUrl+=(pathARR(x) +"/")
}

configuration.set("fs.defaultFS",hdfsUrl)
var fs: FileSystem = FileSystem.get(configuration)

val hdfsPathTemporary:String = hdfsUrl+"/microoCache/genbank/genbankcach.json"
val hdfsPathTemporary = hdfsUrl+cachePath+"/genebankCache/genebankCache.json"

val path: Path = new Path(hdfsPathTemporary)
if(fs.exists(path)){
fs.delete(path)
}
fs.create(path).close()

var fdosOut: FSDataOutputStream = fs.append(path)
var jsonStr: String =""
var bisIn: BufferedInputStream =null
val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))

inDf.collect().foreach(row=>{

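This cache-file handling (continued in the next hunk, which streams each parsed record into the same file) is repeated nearly verbatim in every parser stop of this merge. A self-contained sketch of the pattern under those assumptions; writeCachedJson is a hypothetical helper, while the FileSystem, OutputStreamWriter and spark.read.json calls are the ones the diff itself uses:

    // Sketch: (re)create <hdfsUrl><cachePath>/exampleCache/exampleCache.json, append one
    // JSON document per line, then load the file back as a DataFrame for the output port.
    import java.io.OutputStreamWriter
    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.{FileSystem, Path}
    import org.apache.spark.sql.{DataFrame, SparkSession}
    import org.json.JSONObject

    object CacheSketch {
      def writeCachedJson(spark: SparkSession, hdfsUrl: String, cachePath: String,
                          docs: Iterator[JSONObject]): DataFrame = {
        val conf = new Configuration()
        conf.set("fs.defaultFS", hdfsUrl)
        val fs = FileSystem.get(conf)

        val path = new Path(hdfsUrl + cachePath + "/exampleCache/exampleCache.json")
        if (fs.exists(path)) fs.delete(path, true)   // drop any stale cache file first
        fs.create(path).close()

        val writer = new OutputStreamWriter(fs.append(path))
        docs.foreach { doc => doc.write(writer); writer.write("\n") }   // JSON Lines
        writer.close()

        spark.read.json(path.toString)   // Spark reads one JSON object per line
      }
    }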
@ -60,48 +56,35 @@ class GenBankData extends ConfigurableStop{
|
|||
|
||||
var doc: JSONObject = null
|
||||
var count = 0
|
||||
while (sequenceIterator.hasNext) {
|
||||
count += 1
|
||||
doc = new JSONObject
|
||||
while (sequenceIterator.hasNext) {
|
||||
count += 1
|
||||
doc = new JSONObject
|
||||
|
||||
val seq = sequenceIterator.nextRichSequence()
|
||||
val seq = sequenceIterator.nextRichSequence()
|
||||
Process.processSingleSequence(seq, doc)
|
||||
|
||||
Process.processSingleSequence(seq, doc)
|
||||
doc.write(hdfsWriter)
|
||||
hdfsWriter.write("\n")
|
||||
|
||||
|
||||
bisIn = new BufferedInputStream(new ByteArrayInputStream((doc.toString+"\n").getBytes()))
|
||||
|
||||
val buff: Array[Byte] = new Array[Byte](1048576)
|
||||
var num: Int = bisIn.read(buff)
|
||||
while (num != -1) {
|
||||
fdosOut.write(buff, 0, num)
|
||||
fdosOut.flush()
|
||||
num = bisIn.read(buff)
|
||||
}
|
||||
|
||||
fdosOut.flush()
|
||||
bisIn = null
|
||||
|
||||
}
|
||||
br.close()
|
||||
fdis.close()
|
||||
})
|
||||
|
||||
fdosOut.close()
|
||||
hdfsWriter.close()
|
||||
println("start parser HDFSjsonFile")
|
||||
val df: DataFrame = spark.read.json(hdfsPathTemporary)
|
||||
|
||||
df.schema.printTreeString()
|
||||
out.write(df)
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
def setProperties(map: Map[String, Any]): Unit = {
|
||||
cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String]
|
||||
}
|
||||
|
||||
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
|
||||
var descriptor : List[PropertyDescriptor] = List()
|
||||
|
||||
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
|
||||
.defaultValue("/genbank").required(true)
|
||||
descriptor = cachePath :: descriptor
|
||||
descriptor
|
||||
}
|
||||
|
||||
|
|
|
@ -1,10 +1,10 @@
package cn.piflow.bundle.microorganism

import java.io.{BufferedInputStream, BufferedReader, ByteArrayInputStream, InputStreamReader}
import java.io._
import java.text.SimpleDateFormat

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.ImageUtil
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.hadoop.conf.Configuration
@ -18,13 +18,13 @@ class Gene extends ConfigurableStop{
override val inportList: List[String] =List(PortEnum.DefaultPort.toString)
override val outportList: List[String] = List(PortEnum.DefaultPort.toString)

var cachePath:String = _
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val session = pec.get[SparkSession]()

val inDf: DataFrame = in.read()
val configuration: Configuration = new Configuration()

val configuration: Configuration = new Configuration()
var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String]
val pathARR: Array[String] = pathStr.split("\\/")
var hdfsUrl:String=""

@ -34,19 +34,15 @@ class Gene extends ConfigurableStop{
configuration.set("fs.defaultFS",hdfsUrl)
var fs: FileSystem = FileSystem.get(configuration)

val hdfsPathTemporary:String = hdfsUrl+"/Refseq_genomeParser_temporary.json"
val path: Path = new Path(hdfsPathTemporary)
val hdfsPathTemporary = hdfsUrl+cachePath+"/geneCache/geneCache.json"

val path: Path = new Path(hdfsPathTemporary)
if(fs.exists(path)){
fs.delete(path)
}

fs.create(path).close()
var fdos: FSDataOutputStream = fs.append(path)

var jsonStr: String =""

var bis: BufferedInputStream =null
val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))

var names:Array[String]=Array("tax_id", "geneID", "symbol", "locus_tag", "synonyms", "dbxrefs", "chromosome", "map_location", "description", "type_of_gene",
"symbol_from_nomenclature_authority", "full_name_from_nomenclature_authority",

@ -54,7 +50,7 @@ class Gene extends ConfigurableStop{
val format: java.text.DateFormat = new SimpleDateFormat("yyyyMMdd").asInstanceOf[java.text.DateFormat]
val newFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")

var n:Int=0
var count:Int=0
inDf.collect().foreach(row => {

pathStr = row.get(0).asInstanceOf[String]

@ -66,9 +62,9 @@ class Gene extends ConfigurableStop{
var line:String=""
var doc:JSONObject=null

while ((line=br.readLine()) != null){
while ((line=br.readLine()) != null && count < 10 ){
if( ! line.startsWith("#")){
n += 1
count += 1
doc=new JSONObject()
val tokens: Array[String] = line.split("\\\t")
for(i <- (0 until 15)){

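The hunk above (and the one that follows) walks a tab-delimited gene_info file line by line and turns each record into a JSON document keyed by the names array. A simplified, self-contained sketch of that per-line step; lineToDoc is a hypothetical helper, the shortened names array and the date formats are taken from the diff:

    // Sketch: one tab-separated gene_info record -> one JSONObject.
    import java.text.SimpleDateFormat
    import org.json.JSONObject

    object GeneLineSketch {
      private val names  = Array("tax_id", "geneID", "symbol", "locus_tag", "synonyms")
      private val inFmt  = new SimpleDateFormat("yyyyMMdd")
      private val outFmt = new SimpleDateFormat("yyyy-MM-dd")

      def lineToDoc(line: String): Option[JSONObject] = {
        if (line.startsWith("#")) return None        // skip header/comment lines
        val tokens = line.split("\t")
        val doc = new JSONObject()
        for (i <- names.indices if i < tokens.length) {
          doc.put(names(i), tokens(i))               // positional column -> named field
        }
        // date-typed columns are reformatted yyyyMMdd -> yyyy-MM-dd in the real stop, e.g.
        // doc.put("Modification_date", outFmt.format(inFmt.parse(tokens.last)))
        Some(doc)
      }
    }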
@ -84,54 +80,28 @@ class Gene extends ConfigurableStop{
|
|||
doc.put(names(i),newFormat.format(format.parse(tokens(i))))
|
||||
}
|
||||
}
|
||||
jsonStr = doc.toString
|
||||
if (n == 1) {
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("[" + jsonStr).getBytes()))
|
||||
} else {
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("," + jsonStr).getBytes()))
|
||||
}
|
||||
|
||||
val buff: Array[Byte] = new Array[Byte](1048576)
|
||||
|
||||
var count: Int = bis.read(buff)
|
||||
while (count != -1) {
|
||||
fdos.write(buff, 0, count)
|
||||
fdos.flush()
|
||||
count = bis.read(buff)
|
||||
}
|
||||
fdos.flush()
|
||||
bis = null
|
||||
doc = null
|
||||
doc.write(hdfsWriter)
|
||||
hdfsWriter.write("\n")
|
||||
}
|
||||
}
|
||||
|
||||
br.close()
|
||||
fdis.close()
|
||||
})
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("]").getBytes()))
|
||||
val buff: Array[Byte] = new Array[Byte](1048576)
|
||||
|
||||
var count: Int = bis.read(buff)
|
||||
while (count != -1) {
|
||||
fdos.write(buff, 0, count)
|
||||
fdos.flush()
|
||||
count = bis.read(buff)
|
||||
}
|
||||
fdos.flush()
|
||||
|
||||
fdos.close()
|
||||
|
||||
hdfsWriter.close()
|
||||
val df: DataFrame = session.read.json(hdfsPathTemporary)
|
||||
|
||||
out.write(df)
|
||||
|
||||
|
||||
}
|
||||
|
||||
override def setProperties(map: Map[String, Any]): Unit = {
|
||||
|
||||
def setProperties(map: Map[String, Any]): Unit = {
|
||||
cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String]
|
||||
}
|
||||
|
||||
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
|
||||
var descriptor : List[PropertyDescriptor] = List()
|
||||
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
|
||||
.defaultValue("/gene").required(true)
|
||||
descriptor = cachePath :: descriptor
|
||||
descriptor
|
||||
}
|
||||
|
||||
|
|
|
@ -35,21 +35,18 @@ class GoData extends ConfigurableStop{
for (x <- (0 until 3)){
hdfsUrl+=(pathARR(x) +"/")
}

configuration.set("fs.defaultFS",hdfsUrl)
var fs: FileSystem = FileSystem.get(configuration)

val hdfsPathJsonCache = hdfsUrl+cachePath+"/godataCache/godataCache.json"
val hdfsPathTemporary = hdfsUrl+cachePath+"/godataCache/godataCache.json"

val path: Path = new Path(hdfsPathJsonCache)
val path: Path = new Path(hdfsPathTemporary)
if(fs.exists(path)){
fs.delete(path)
}
fs.create(path).close()

var fdosOut: FSDataOutputStream = fs.append(path)
var jsonStr: String =""
var bisIn: BufferedInputStream =null
val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))

inDf.collect().foreach(row => {
pathStr = row.get(0).asInstanceOf[String]

@ -65,7 +62,7 @@ class GoData extends ConfigurableStop{
}
var obj = new JSONObject()
var count= 0
while ((line = br.readLine()) !=null && line !=null ){
while ((line = br.readLine()) !=null && line !=null && count<10){
val m: Matcher = tv.matcher(line)
if (line.startsWith("[")){
if (line .equals("[Term]")){

@ -89,28 +86,20 @@ class GoData extends ConfigurableStop{
|
|||
}
|
||||
count += 1
|
||||
|
||||
bisIn = new BufferedInputStream(new ByteArrayInputStream((obj.toString+"\n").getBytes()))
|
||||
|
||||
val buff: Array[Byte] = new Array[Byte](1048576)
|
||||
var num: Int = bisIn.read(buff)
|
||||
while (num != -1) {
|
||||
fdosOut.write(buff, 0, num)
|
||||
fdosOut.flush()
|
||||
num = bisIn.read(buff)
|
||||
}
|
||||
fdosOut.flush()
|
||||
bisIn = null
|
||||
|
||||
obj.write(hdfsWriter)
|
||||
hdfsWriter.write("\n")
|
||||
|
||||
obj= new JSONObject()
|
||||
}
|
||||
}
|
||||
br.close()
|
||||
fdis.close()
|
||||
})
|
||||
hdfsWriter.close()
|
||||
|
||||
fdosOut.close()
|
||||
|
||||
println("start parser HDFSjsonFile")
|
||||
val df: DataFrame = spark.read.json(hdfsPathJsonCache)
|
||||
val df: DataFrame = spark.read.json(hdfsPathTemporary)
|
||||
out.write(df)
|
||||
|
||||
}
|
||||
|
@ -120,7 +109,8 @@ class GoData extends ConfigurableStop{

override def getPropertyDescriptor(): List[PropertyDescriptor] = {
var descriptor : List[PropertyDescriptor] = List()
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").defaultValue("").required(true)
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
.defaultValue("/goData").required(true)
descriptor = cachePath :: descriptor
descriptor
}

@ -35,16 +35,15 @@ class InterproData extends ConfigurableStop{
configuration.set("fs.defaultFS",hdfsUrl)
var fs: FileSystem = FileSystem.get(configuration)

val hdfsPathJsonCache = hdfsUrl+cachePath+"/interproDataCatch/interproDataCatch.json"
val path: Path = new Path(hdfsPathJsonCache)
val hdfsPathTemporary = hdfsUrl+cachePath+"/interproCache/interproCache.json"

val path: Path = new Path(hdfsPathTemporary)
if(fs.exists(path)){
fs.delete(path)
}
fs.create(path).close()

var fdosOut: FSDataOutputStream = fs.append(path)
var jsonStr: String =""
var bisIn: BufferedInputStream =null
val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))

inDf.collect().foreach(row => {
pathStr = row.get(0).asInstanceOf[String]

@ -76,28 +75,20 @@ class InterproData extends ConfigurableStop{
|
|||
doc.remove("pub_list")
|
||||
}
|
||||
|
||||
|
||||
bisIn = new BufferedInputStream(new ByteArrayInputStream((doc.toString+"\n").getBytes()))
|
||||
|
||||
val buff: Array[Byte] = new Array[Byte](1048576)
|
||||
var num: Int = bisIn.read(buff)
|
||||
while (num != -1) {
|
||||
fdosOut.write(buff, 0, num)
|
||||
fdosOut.flush()
|
||||
num = bisIn.read(buff)
|
||||
}
|
||||
fdosOut.flush()
|
||||
bisIn = null
|
||||
doc.write(hdfsWriter)
|
||||
hdfsWriter.write("\n")
|
||||
|
||||
xml = ""
|
||||
}
|
||||
}
|
||||
br.close()
|
||||
fdis.close()
|
||||
})
|
||||
|
||||
fdosOut.close()
|
||||
hdfsWriter.close()
|
||||
|
||||
println("start parser HDFSjsonFile")
|
||||
val df: DataFrame = spark.read.json(hdfsPathJsonCache)
|
||||
val df: DataFrame = spark.read.json(hdfsPathTemporary)
|
||||
|
||||
out.write(df)
|
||||
|
||||
|
@ -109,7 +100,8 @@ class InterproData extends ConfigurableStop{

override def getPropertyDescriptor(): List[PropertyDescriptor] = {
var descriptor : List[PropertyDescriptor] = List()
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").defaultValue("").required(true)
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
.defaultValue("/interpro").required(true)
descriptor = cachePath :: descriptor
descriptor
}

@ -7,7 +7,7 @@ import java.util.Locale
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.ImageUtil
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
@ -19,11 +19,12 @@ class MedlineData extends ConfigurableStop{
override val inportList: List[String] =List(PortEnum.DefaultPort.toString)
override val outportList: List[String] = List(PortEnum.DefaultPort.toString)

var cachePath:String = _

var docName = "PubmedArticle"
val formatter = new SimpleDateFormat("yyyy-MM-dd")
val format = new SimpleDateFormat("dd-MM-yyyy")
val format_english = new SimpleDateFormat("dd-MMM-yyyy", Locale.ENGLISH)
var docName = "PubmedArticle"
val formatter = new SimpleDateFormat("yyyy-MM-dd")
val format = new SimpleDateFormat("dd-MM-yyyy")
val format_english = new SimpleDateFormat("dd-MMM-yyyy", Locale.ENGLISH)

override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val inDf: DataFrame = in.read()

@ -38,7 +39,7 @@ class MedlineData extends ConfigurableStop{
configuration.set("fs.defaultFS",hdfsUrl)
var fs: FileSystem = FileSystem.get(configuration)

val hdfsPathTemporary:String = hdfsUrl+"/yqd/test/medline/medlineAll.json"
val hdfsPathTemporary = hdfsUrl+cachePath+"/medlineCache/medlineCache.json"
val path: Path = new Path(hdfsPathTemporary)
if(fs.exists(path)){
fs.delete(path)

@ -56,27 +57,23 @@ class MedlineData extends ConfigurableStop{
|
|||
fileNum += 1
|
||||
|
||||
pathStr = row.get(0).asInstanceOf[String]
|
||||
println(fileNum+"-----start parser ^^^" + pathStr)
|
||||
fdis = fs.open(new Path(pathStr))
|
||||
br = new BufferedReader(new InputStreamReader(fdis))
|
||||
|
||||
var i = 0
|
||||
var eachLine: String= ""
|
||||
while (i < 3) {
|
||||
eachLine = br.readLine()
|
||||
i += 1
|
||||
}
|
||||
|
||||
var xml = ""
|
||||
while ((eachLine = br.readLine) != null && eachLine != null ) {
|
||||
|
||||
xml += eachLine
|
||||
|
||||
if (eachLine.indexOf("</"+docName+">")!= -1){
|
||||
count += 1
|
||||
|
||||
doc = XML.toJSONObject(xml).getJSONObject(docName).optJSONObject("MedlineCitation")
|
||||
println(count)
|
||||
// println(doc.toString)
|
||||
|
||||
|
||||
if (doc.optJSONObject("DateCreated") != null) {
|
||||
val dateCreated = doc.getJSONObject("DateCreated")
|
||||
|
@ -130,9 +127,6 @@ class MedlineData extends ConfigurableStop{
if (pubDate.opt("Year") != null) doc.put("PubYear", pubDate.get("Year"))

if (pubDate.opt("Year") != null && pubDate.opt("Month") != null && pubDate.opt("Day") != null) {

var month = pubDate.get("Month")
if (month.toString.contains("01") || month.toString.contains("1")) month = "Jan"
if (month.toString.contains("02") || month.toString.contains("2")) month = "Feb"

@ -147,10 +141,7 @@ class MedlineData extends ConfigurableStop{
if (month.toString.contains("11")) month = "Nov"
if (month.toString.contains("12")) month = "Dec"

val date = pubDate.get("Day") + "-" +month + "-" + pubDate.get("Year")

// println(date+"@@@@@@@@@@@")
doc.put("PubDate", formatter.format(format_english.parse(date)))
}
}

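The two hunks above normalise the citation's publication date: a numeric or abbreviated Month value is mapped onto an English month abbreviation and the dd-MMM-yyyy string is reparsed into yyyy-MM-dd. A compact equivalent sketch, not the stop's literal code; the monthMap is my own lookup table, while the two SimpleDateFormat patterns are the ones declared in this class:

    // Sketch of the PubDate normalisation used above.
    import java.text.SimpleDateFormat
    import java.util.Locale

    object PubDateSketch {
      private val formatter     = new SimpleDateFormat("yyyy-MM-dd")
      private val formatEnglish = new SimpleDateFormat("dd-MMM-yyyy", Locale.ENGLISH)
      private val monthMap = Map(
        "1" -> "Jan", "2" -> "Feb", "3" -> "Mar", "4"  -> "Apr", "5"  -> "May", "6"  -> "Jun",
        "7" -> "Jul", "8" -> "Aug", "9" -> "Sep", "10" -> "Oct", "11" -> "Nov", "12" -> "Dec")

      def normalise(day: String, month: String, year: String): String = {
        // "01" and "Jan" both resolve to "Jan"; unknown values pass through unchanged
        val mon = monthMap.getOrElse(month.stripPrefix("0"), month)
        formatter.format(formatEnglish.parse(s"$day-$mon-$year"))
      }
    }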
@ -169,17 +160,13 @@ class MedlineData extends ConfigurableStop{
|
|||
hdfsWriter.write("\n")
|
||||
|
||||
xml = ""
|
||||
|
||||
}
|
||||
}
|
||||
br.close()
|
||||
fdis.close()
|
||||
})
|
||||
hdfsWriter.close()
|
||||
|
||||
val df: DataFrame = pec.get[SparkSession]().read.json(hdfsPathTemporary)
|
||||
df.schema.printTreeString()
|
||||
// println(df.count())
|
||||
out.write(df)
|
||||
|
||||
}
|
||||
|
@ -199,9 +186,7 @@ class MedlineData extends ConfigurableStop{
return findJSONObject(objKey, obj.getJSONArray(key).getJSONObject(i))
}
}

}

}
}
return null

@ -210,15 +195,17 @@ class MedlineData extends ConfigurableStop{

override def setProperties(map: Map[String, Any]): Unit = {

def setProperties(map: Map[String, Any]): Unit = {
cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String]
}

override def getPropertyDescriptor(): List[PropertyDescriptor] ={
var descriptor = List()
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
var descriptor : List[PropertyDescriptor] = List()
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
.defaultValue("/medline").required(true)
descriptor = cachePath :: descriptor
descriptor
}

override def getIcon(): Array[Byte] = {
ImageUtil.getImage("icon/microorganism/MedlineData.png")
}

@ -1,10 +1,10 @@
package cn.piflow.bundle.microorganism

import java.io.{BufferedInputStream, BufferedReader, ByteArrayInputStream, InputStreamReader}
import java.io._

import cn.piflow.bundle.microorganism.util.{CustomIOTools, Process}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.ImageUtil
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.hadoop.conf.Configuration
@ -14,118 +14,83 @@ import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator}
|
|||
import org.json.JSONObject
|
||||
|
||||
|
||||
|
||||
class MicrobeGenomeData extends ConfigurableStop{
|
||||
override val authorEmail: String = "yangqidong@cnic.cn"
|
||||
override val description: String = "Parse MicrobeGenome data"
|
||||
override val inportList: List[String] =List(PortEnum.DefaultPort.toString)
|
||||
override val outportList: List[String] = List(PortEnum.DefaultPort.toString)
|
||||
|
||||
var cachePath:String = _
|
||||
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
|
||||
val session = pec.get[SparkSession]()
|
||||
|
||||
val spark = pec.get[SparkSession]()
|
||||
val inDf: DataFrame = in.read()
|
||||
|
||||
val configuration: Configuration = new Configuration()
|
||||
var pathStr: String = ""
|
||||
var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String]
|
||||
val pathARR: Array[String] = pathStr.split("\\/")
|
||||
var hdfsUrl:String=""
|
||||
try{
|
||||
pathStr =inDf.take(1)(0).get(0).asInstanceOf[String]
|
||||
val pathARR: Array[String] = pathStr.split("\\/")
|
||||
|
||||
for (x <- (0 until 3)){
|
||||
hdfsUrl+=(pathARR(x) +"/")
|
||||
}
|
||||
}catch {
|
||||
case e:Exception => throw new Exception("Path error")
|
||||
for (x <- (0 until 3)){
|
||||
hdfsUrl+=(pathARR(x) +"/")
|
||||
}
|
||||
|
||||
configuration.set("fs.defaultFS",hdfsUrl)
|
||||
var fs: FileSystem = FileSystem.get(configuration)
|
||||
|
||||
val hdfsPathTemporary:String = hdfsUrl+"/NCBI_Microbe_genome_genomeParser_temporary.json"
|
||||
val path: Path = new Path(hdfsPathTemporary)
|
||||
val hdfsPathTemporary = hdfsUrl+cachePath+"/microbeGenomeCache/microbeGenomeCache.json"
|
||||
|
||||
val path: Path = new Path(hdfsPathTemporary)
|
||||
if(fs.exists(path)){
|
||||
fs.delete(path)
|
||||
}
|
||||
|
||||
fs.create(path).close()
|
||||
var fdos: FSDataOutputStream = fs.append(path)
|
||||
val buff: Array[Byte] = new Array[Byte](1048576)
|
||||
|
||||
var bis: BufferedInputStream =null
|
||||
val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))
|
||||
|
||||
var fdis: FSDataInputStream =null
|
||||
var br: BufferedReader = null
|
||||
var sequences: RichSequenceIterator = null
|
||||
var doc: JSONObject = null
|
||||
var seq: RichSequence = null
|
||||
var jsonStr: String = ""
|
||||
var n:Int=0
|
||||
|
||||
var count:Int=0
|
||||
inDf.collect().foreach(row => {
|
||||
|
||||
pathStr = row.get(0).asInstanceOf[String]
|
||||
println("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! start parser ^^^" + pathStr)
|
||||
|
||||
fdis = fs.open(new Path(pathStr))
|
||||
br = new BufferedReader(new InputStreamReader(fdis))
|
||||
sequences = CustomIOTools.IOTools.readGenbankProtein(br, null)
|
||||
|
||||
while (sequences.hasNext) {
|
||||
n += 1
|
||||
count += 1
|
||||
doc = new JSONObject()
|
||||
seq = sequences.nextRichSequence()
|
||||
Process.processSingleSequence(seq,doc)
|
||||
Process.processSingleSequence(seq, doc)
|
||||
|
||||
jsonStr = doc.toString
|
||||
doc.write(hdfsWriter)
|
||||
hdfsWriter.write("\n")
|
||||
|
||||
if (n == 1) {
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("[" + jsonStr).getBytes()))
|
||||
} else {
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("," + jsonStr).getBytes()))
|
||||
}
|
||||
var count: Int = bis.read(buff)
|
||||
while (count != -1) {
|
||||
fdos.write(buff, 0, count)
|
||||
fdos.flush()
|
||||
count = bis.read(buff)
|
||||
}
|
||||
fdos.flush()
|
||||
|
||||
bis.close()
|
||||
bis = null
|
||||
doc = null
|
||||
seq = null
|
||||
jsonStr = ""
|
||||
|
||||
}
|
||||
sequences = null
|
||||
br.close()
|
||||
br = null
|
||||
fdis.close()
|
||||
fdis =null
|
||||
pathStr = null
|
||||
|
||||
}
|
||||
br.close()
|
||||
fdis.close()
|
||||
})
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("]").getBytes()))
|
||||
|
||||
var count: Int = bis.read(buff)
|
||||
while (count != -1) {
|
||||
fdos.write(buff, 0, count)
|
||||
fdos.flush()
|
||||
count = bis.read(buff)
|
||||
}
|
||||
fdos.flush()
|
||||
bis.close()
|
||||
fdos.close()
|
||||
|
||||
out.write(session.read.json(hdfsPathTemporary))
|
||||
hdfsWriter.close()
|
||||
|
||||
println("start parser HDFSjsonFile")
|
||||
val df: DataFrame = spark.read.json(hdfsPathTemporary)
|
||||
out.write(df)
|
||||
}
|
||||
|
||||
override def setProperties(map: Map[String, Any]): Unit = {
|
||||
|
||||
def setProperties(map: Map[String, Any]): Unit = {
|
||||
cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String]
|
||||
}
|
||||
|
||||
override def getPropertyDescriptor(): List[PropertyDescriptor] ={
|
||||
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
|
||||
var descriptor : List[PropertyDescriptor] = List()
|
||||
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
|
||||
.defaultValue("/microbeGenome").required(true)
|
||||
descriptor = cachePath :: descriptor
|
||||
descriptor
|
||||
}
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@ import java.io._

import cn.piflow.bundle.microorganism.util.PDB
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.ImageUtil
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.hadoop.conf.Configuration
@ -19,105 +19,61 @@ class PDBData extends ConfigurableStop{
|
|||
override val inportList: List[String] =List(PortEnum.DefaultPort.toString)
|
||||
override val outportList: List[String] = List(PortEnum.DefaultPort.toString)
|
||||
|
||||
|
||||
var cachePath:String = _
|
||||
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
|
||||
|
||||
val session = pec.get[SparkSession]()
|
||||
|
||||
val inDf: DataFrame = in.read()
|
||||
|
||||
val configuration: Configuration = new Configuration()
|
||||
var pathStr: String = ""
|
||||
var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String]
|
||||
val pathARR: Array[String] = pathStr.split("\\/")
|
||||
var hdfsUrl:String=""
|
||||
try{
|
||||
pathStr =inDf.take(1)(0).get(0).asInstanceOf[String]
|
||||
val pathARR: Array[String] = pathStr.split("\\/")
|
||||
|
||||
for (x <- (0 until 3)){
|
||||
hdfsUrl+=(pathARR(x) +"/")
|
||||
}
|
||||
}catch {
|
||||
case e:Exception => throw new Exception("Path error")
|
||||
for (x <- (0 until 3)){
|
||||
hdfsUrl+=(pathARR(x) +"/")
|
||||
}
|
||||
|
||||
configuration.set("fs.defaultFS",hdfsUrl)
|
||||
var fs: FileSystem = FileSystem.get(configuration)
|
||||
|
||||
val hdfsPathTemporary:String = hdfsUrl+"/Refseq_genomeParser_temporary.json"
|
||||
val path: Path = new Path(hdfsPathTemporary)
|
||||
val hdfsPathTemporary = hdfsUrl+cachePath+"/PDBCache/PDBCache.json"
|
||||
|
||||
val path: Path = new Path(hdfsPathTemporary)
|
||||
if(fs.exists(path)){
|
||||
fs.delete(path)
|
||||
}
|
||||
|
||||
fs.create(path).close()
|
||||
var fdos: FSDataOutputStream = fs.append(path)
|
||||
val buff: Array[Byte] = new Array[Byte](1048576)
|
||||
|
||||
var bis: BufferedInputStream =null
|
||||
var fdis: FSDataInputStream =null
|
||||
var br: BufferedReader = null
|
||||
var sequences: RichSequenceIterator = null
|
||||
val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))
|
||||
|
||||
var doc: JSONObject = null
|
||||
var seq: RichSequence = null
|
||||
var pdb: PDB = null
|
||||
var jsonStr: String = ""
|
||||
var n:Int=0
|
||||
var count:Int=0
|
||||
inDf.collect().foreach(row => {
|
||||
count += 1
|
||||
pathStr = row.get(0).asInstanceOf[String]
|
||||
|
||||
pdb = new PDB(pathStr,fs)
|
||||
doc = pdb.getDoc
|
||||
|
||||
jsonStr = doc.toString
|
||||
n +=1
|
||||
doc.write(hdfsWriter)
|
||||
hdfsWriter.write("\n")
|
||||
|
||||
if (n == 1) {
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("[" + jsonStr).getBytes()))
|
||||
} else {
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("," + jsonStr).getBytes()))
|
||||
}
|
||||
var count: Int = bis.read(buff)
|
||||
while (count != -1) {
|
||||
fdos.write(buff, 0, count)
|
||||
fdos.flush()
|
||||
count = bis.read(buff)
|
||||
}
|
||||
fdos.flush()
|
||||
|
||||
bis = null
|
||||
doc = null
|
||||
seq = null
|
||||
jsonStr = ""
|
||||
sequences = null
|
||||
br = null
|
||||
fdis =null
|
||||
pathStr = null
|
||||
pdb = null
|
||||
})
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("]").getBytes()))
|
||||
|
||||
var count: Int = bis.read(buff)
|
||||
while (count != -1) {
|
||||
fdos.write(buff, 0, count)
|
||||
fdos.flush()
|
||||
count = bis.read(buff)
|
||||
}
|
||||
fdos.flush()
|
||||
bis.close()
|
||||
fdos.close()
|
||||
hdfsWriter.close()
|
||||
|
||||
val df: DataFrame = session.read.json(hdfsPathTemporary)
|
||||
|
||||
out.write(df)
|
||||
}
|
||||
|
||||
|
||||
override def setProperties(map: Map[String, Any]): Unit = {
|
||||
|
||||
def setProperties(map: Map[String, Any]): Unit = {
|
||||
cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String]
|
||||
}
|
||||
|
||||
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
|
||||
var descriptor : List[PropertyDescriptor] = List()
|
||||
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
|
||||
.defaultValue("/PDB").required(true)
|
||||
descriptor = cachePath :: descriptor
|
||||
descriptor
|
||||
}
|
||||
|
||||
|
|
|
@ -5,24 +5,30 @@ import java.io.{BufferedReader, InputStreamReader, OutputStreamWriter}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.ImageUtil
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.json.JSONObject


class Pathway extends ConfigurableStop{
override val authorEmail: String = "yangqidong@cnic.cn"
override val description: String = "Parse Pathway data"
override val inportList: List[String] =List(PortEnum.DefaultPort.toString)
override val outportList: List[String] = List(PortEnum.DefaultPort.toString)

override def setProperties(map: Map[String, Any]): Unit = {

var cachePath:String = _
def setProperties(map: Map[String, Any]): Unit = {
cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String]
}

override def getPropertyDescriptor(): List[PropertyDescriptor] ={
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
var descriptor : List[PropertyDescriptor] = List()
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
.defaultValue("/pathway").required(true)
descriptor = cachePath :: descriptor
descriptor
}

@ -51,8 +57,8 @@ class Pathway extends ConfigurableStop{
configuration.set("fs.defaultFS",hdfsUrl)
var fs: FileSystem = FileSystem.get(configuration)

// "/biosampleCache/biosampleCache.json"
val hdfsPathTemporary:String = hdfsUrl+"/yqd/test/kegg/Refseq_genomeParser_temporary2.json"

val hdfsPathTemporary = hdfsUrl+cachePath+"/pathwayCache/pathwayCache.json"
val path: Path = new Path(hdfsPathTemporary)
if(fs.exists(path)){
fs.delete(path)

@ -67,17 +73,15 @@ class Pathway extends ConfigurableStop{
|
|||
|
||||
inDf.collect().foreach(row => {
|
||||
pathStr = row.get(0).asInstanceOf[String]
|
||||
println("start parser ^^^" + pathStr)
|
||||
|
||||
fdis = fs.open(new Path(pathStr))
|
||||
br = new BufferedReader(new InputStreamReader(fdis))
|
||||
var count = 0
|
||||
while (hasAnotherSequence) {
|
||||
count += 1
|
||||
println(count)
|
||||
doc = new JSONObject
|
||||
hasAnotherSequence = util.KeggPathway.process(br, doc)
|
||||
|
||||
|
||||
doc.write(hdfsWriter)
|
||||
hdfsWriter.write("\n")
|
||||
}
|
||||
|
|
|
@ -1,10 +1,10 @@
package cn.piflow.bundle.microorganism

import java.io.{BufferedInputStream, BufferedReader, ByteArrayInputStream, InputStreamReader}
import java.io._

import cn.piflow.bundle.microorganism.util.Pfam
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.ImageUtil
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.hadoop.conf.Configuration
@ -20,108 +20,71 @@ class PfamData extends ConfigurableStop{
|
|||
override val outportList: List[String] = List(PortEnum.DefaultPort.toString)
|
||||
|
||||
|
||||
var cachePath:String = _
|
||||
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
|
||||
val session = pec.get[SparkSession]()
|
||||
|
||||
val inDf: DataFrame = in.read()
|
||||
|
||||
val configuration: Configuration = new Configuration()
|
||||
var pathStr: String = ""
|
||||
var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String]
|
||||
val pathARR: Array[String] = pathStr.split("\\/")
|
||||
var hdfsUrl:String=""
|
||||
try{
|
||||
pathStr =inDf.take(1)(0).get(0).asInstanceOf[String]
|
||||
val pathARR: Array[String] = pathStr.split("\\/")
|
||||
|
||||
for (x <- (0 until 3)){
|
||||
hdfsUrl+=(pathARR(x) +"/")
|
||||
}
|
||||
}catch {
|
||||
case e:Exception => throw new Exception("Path error")
|
||||
for (x <- (0 until 3)){
|
||||
hdfsUrl+=(pathARR(x) +"/")
|
||||
}
|
||||
|
||||
configuration.set("fs.defaultFS",hdfsUrl)
|
||||
var fs: FileSystem = FileSystem.get(configuration)
|
||||
|
||||
val hdfsPathTemporary:String = hdfsUrl+"/Refseq_genomeParser_temporary.json"
|
||||
val path: Path = new Path(hdfsPathTemporary)
|
||||
val hdfsPathTemporary = hdfsUrl+cachePath+"/pfamCache/pfamCache.json"
|
||||
|
||||
val path: Path = new Path(hdfsPathTemporary)
|
||||
if(fs.exists(path)){
|
||||
fs.delete(path)
|
||||
}
|
||||
|
||||
fs.create(path).close()
|
||||
var fdos: FSDataOutputStream = fs.append(path)
|
||||
val buff: Array[Byte] = new Array[Byte](1048576)
|
||||
|
||||
var bis: BufferedInputStream =null
|
||||
val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))
|
||||
|
||||
var fdis: FSDataInputStream =null
|
||||
var br: BufferedReader = null
|
||||
var sequences: RichSequenceIterator = null
|
||||
|
||||
var doc: JSONObject = null
|
||||
var seq: RichSequence = null
|
||||
var hasAnotherSequence : Boolean=true
|
||||
var jsonStr: String = ""
|
||||
var n:Int=0
|
||||
|
||||
var count:Int=0
|
||||
inDf.collect().foreach(row => {
|
||||
pathStr = row.get(0).asInstanceOf[String]
|
||||
fdis = fs.open(new Path(pathStr))
|
||||
br = new BufferedReader(new InputStreamReader(fdis))
|
||||
|
||||
while( hasAnotherSequence && n < 1000 ){
|
||||
n += 1
|
||||
|
||||
while( hasAnotherSequence ){
|
||||
count += 1
|
||||
doc = new JSONObject()
|
||||
hasAnotherSequence = Pfam.process(br,doc)
|
||||
|
||||
jsonStr = doc.toString
|
||||
doc.write(hdfsWriter)
|
||||
hdfsWriter.write("\n")
|
||||
|
||||
if (n == 1) {
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("[" + jsonStr).getBytes()))
|
||||
} else {
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("," + jsonStr).getBytes()))
|
||||
}
|
||||
var count: Int = bis.read(buff)
|
||||
while (count != -1) {
|
||||
fdos.write(buff, 0, count)
|
||||
fdos.flush()
|
||||
count = bis.read(buff)
|
||||
}
|
||||
fdos.flush()
|
||||
|
||||
bis = null
|
||||
doc = null
|
||||
seq = null
|
||||
jsonStr = ""
|
||||
}
|
||||
|
||||
sequences = null
|
||||
br = null
|
||||
fdis =null
|
||||
pathStr = null
|
||||
br.close()
|
||||
fdis.close()
|
||||
})
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("]").getBytes()))
|
||||
|
||||
var count: Int = bis.read(buff)
|
||||
while (count != -1) {
|
||||
fdos.write(buff, 0, count)
|
||||
fdos.flush()
|
||||
count = bis.read(buff)
|
||||
}
|
||||
fdos.flush()
|
||||
bis.close()
|
||||
fdos.close()
|
||||
hdfsWriter.close()
|
||||
|
||||
val df: DataFrame = session.read.json(hdfsPathTemporary)
|
||||
|
||||
out.write(df)
|
||||
}
|
||||
|
||||
|
||||
override def setProperties(map: Map[String, Any]): Unit = {
|
||||
|
||||
def setProperties(map: Map[String, Any]): Unit = {
|
||||
cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String]
|
||||
}
|
||||
|
||||
override def getPropertyDescriptor(): List[PropertyDescriptor] ={
|
||||
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
|
||||
var descriptor : List[PropertyDescriptor] = List()
|
||||
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
|
||||
.defaultValue("/pfam").required(true)
|
||||
descriptor = cachePath :: descriptor
|
||||
descriptor
|
||||
}
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@ import java.io._

import cn.piflow.bundle.microorganism.util.{CustomIOTools, Process}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.ImageUtil
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.hadoop.conf.Configuration
@ -19,15 +19,13 @@ class RefseqData extends ConfigurableStop{
override val inportList: List[String] =List(PortEnum.DefaultPort.toString)
override val outportList: List[String] = List(PortEnum.DefaultPort.toString)

var cachePath:String = _
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

val session = pec.get[SparkSession]()

val inDf: DataFrame = in.read()
val configuration: Configuration = new Configuration()

val configuration: Configuration = new Configuration()
var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String]
val pathARR: Array[String] = pathStr.split("\\/")
var hdfsUrl:String=""

@ -37,88 +35,62 @@ class RefseqData extends ConfigurableStop{
|
|||
configuration.set("fs.defaultFS",hdfsUrl)
|
||||
var fs: FileSystem = FileSystem.get(configuration)
|
||||
|
||||
val hdfsPathTemporary:String = hdfsUrl+"/Refseq_genomeParser_temporary.json"
|
||||
val path: Path = new Path(hdfsPathTemporary)
|
||||
val hdfsPathTemporary = hdfsUrl+cachePath+"/refseqCache/refseqCache.json"
|
||||
|
||||
val path: Path = new Path(hdfsPathTemporary)
|
||||
if(fs.exists(path)){
|
||||
fs.delete(path)
|
||||
}
|
||||
|
||||
fs.create(path).close()
|
||||
var fdos: FSDataOutputStream = fs.append(path)
|
||||
val buff: Array[Byte] = new Array[Byte](1048576)
|
||||
|
||||
var jsonStr: String =""
|
||||
|
||||
var bis: BufferedInputStream =null
|
||||
val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))
|
||||
|
||||
var doc: JSONObject= null
|
||||
var seq: RichSequence = null
|
||||
var br: BufferedReader = null
|
||||
var fdis: FSDataInputStream =null
|
||||
var sequences: RichSequenceIterator = null
|
||||
var count : Int = 0
|
||||
inDf.collect().foreach(row => {
|
||||
|
||||
var n : Int =0
|
||||
pathStr = row.get(0).asInstanceOf[String]
|
||||
|
||||
fdis = fs.open(new Path(pathStr))
|
||||
br = new BufferedReader(new InputStreamReader(fdis))
|
||||
sequences = CustomIOTools.IOTools.readGenbankProtein(br, null)
|
||||
|
||||
var fdis: FSDataInputStream = fs.open(new Path(pathStr))
|
||||
while (sequences.hasNext) {
|
||||
count += 1
|
||||
seq = sequences.nextRichSequence()
|
||||
doc = new JSONObject
|
||||
Process.processSingleSequence(seq, doc)
|
||||
|
||||
var br: BufferedReader = new BufferedReader(new InputStreamReader(fdis))
|
||||
|
||||
var sequences: RichSequenceIterator = CustomIOTools.IOTools.readGenbankProtein(br, null)
|
||||
|
||||
while (sequences.hasNext) {
|
||||
n += 1
|
||||
var seq: RichSequence = sequences.nextRichSequence()
|
||||
var doc: JSONObject = new JSONObject
|
||||
Process.processSingleSequence(seq, doc)
|
||||
jsonStr = doc.toString
|
||||
println("start " + n)
|
||||
|
||||
if (n == 1) {
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("[" + jsonStr).getBytes()))
|
||||
} else {
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("," + jsonStr).getBytes()))
|
||||
doc.write(hdfsWriter)
|
||||
hdfsWriter.write("\n")
|
||||
}
|
||||
|
||||
var count: Int = bis.read(buff)
|
||||
while (count != -1) {
|
||||
fdos.write(buff, 0, count)
|
||||
fdos.flush()
|
||||
count = bis.read(buff)
|
||||
}
|
||||
fdos.flush()
|
||||
bis = null
|
||||
seq = null
|
||||
doc = null
|
||||
}
|
||||
fdis.close()
|
||||
br.close()
|
||||
|
||||
})
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("]").getBytes()))
|
||||
|
||||
var count: Int = bis.read(buff)
|
||||
while (count != -1) {
|
||||
fdos.write(buff, 0, count)
|
||||
fdos.flush()
|
||||
count = bis.read(buff)
|
||||
}
|
||||
fdos.flush()
|
||||
bis.close()
|
||||
fdos.close()
|
||||
|
||||
hdfsWriter.close()
|
||||
println("start parser HDFSjsonFile")
|
||||
val df: DataFrame = session.read.json(hdfsPathTemporary)
|
||||
|
||||
out.write(df)
|
||||
|
||||
|
||||
}
|
||||
|
||||
override def setProperties(map: Map[String, Any]): Unit = {
|
||||
|
||||
def setProperties(map: Map[String, Any]): Unit = {
|
||||
cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String]
|
||||
}
|
||||
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
|
||||
var descriptor : List[PropertyDescriptor] = List()
|
||||
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
|
||||
.defaultValue("/refseqData").required(true)
|
||||
descriptor = cachePath :: descriptor
|
||||
descriptor
|
||||
}
|
||||
|
||||
override def getPropertyDescriptor(): List[PropertyDescriptor] ={
|
||||
var descriptor : List[PropertyDescriptor] = List()
|
||||
descriptor
|
||||
}
|
||||
|
||||
override def getIcon(): Array[Byte] = {
|
||||
ImageUtil.getImage("icon/microorganism/RefseqData.png")
|
||||
|
|
|
@ -1,10 +1,10 @@
package cn.piflow.bundle.microorganism

import java.io.{BufferedInputStream, BufferedReader, ByteArrayInputStream, InputStreamReader}
import java.io._

import cn.piflow.bundle.microorganism.util.{CustomIOTools, Process}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.ImageUtil
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, PortEnum, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.hadoop.conf.Configuration
@ -19,12 +19,16 @@ class SwissprotData extends ConfigurableStop{
override val inportList: List[String] =List(PortEnum.DefaultPort.toString)
override val outportList: List[String] = List(PortEnum.DefaultPort.toString)

override def setProperties(map: Map[String, Any]): Unit = {
var cachePath:String = _

def setProperties(map: Map[String, Any]): Unit = {
cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String]
}

override def getPropertyDescriptor(): List[PropertyDescriptor] ={
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
var descriptor : List[PropertyDescriptor] = List()
val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path")
.defaultValue("/swissprot").required(true)
descriptor = cachePath :: descriptor
descriptor
}

@ -41,96 +45,58 @@ class SwissprotData extends ConfigurableStop{
|
|||
}
|
||||
|
||||
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
|
||||
|
||||
val session = pec.get[SparkSession]()
|
||||
|
||||
val inDf: DataFrame = in.read()
|
||||
|
||||
val configuration: Configuration = new Configuration()
|
||||
var pathStr: String = ""
|
||||
var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String]
|
||||
val pathARR: Array[String] = pathStr.split("\\/")
|
||||
var hdfsUrl:String=""
|
||||
try{
|
||||
pathStr =inDf.take(1)(0).get(0).asInstanceOf[String]
|
||||
val pathARR: Array[String] = pathStr.split("\\/")
|
||||
|
||||
for (x <- (0 until 3)){
|
||||
hdfsUrl+=(pathARR(x) +"/")
|
||||
}
|
||||
}catch {
|
||||
case e:Exception => throw new Exception("Path error")
|
||||
for (x <- (0 until 3)){
|
||||
hdfsUrl+=(pathARR(x) +"/")
|
||||
}
|
||||
|
||||
configuration.set("fs.defaultFS",hdfsUrl)
|
||||
var fs: FileSystem = FileSystem.get(configuration)
|
||||
|
||||
val hdfsPathTemporary:String = hdfsUrl+"/Refseq_genomeParser_temporary.json"
|
||||
val path: Path = new Path(hdfsPathTemporary)
|
||||
val hdfsPathTemporary = hdfsUrl+cachePath+"/swissprotCache/swissprotCache.json"
|
||||
|
||||
val path: Path = new Path(hdfsPathTemporary)
|
||||
if(fs.exists(path)){
|
||||
fs.delete(path)
|
||||
}
|
||||
|
||||
fs.create(path).close()
|
||||
var fdos: FSDataOutputStream = fs.append(path)
|
||||
val buff: Array[Byte] = new Array[Byte](1048576)
|
||||
|
||||
var bis: BufferedInputStream =null
|
||||
val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path))
|
||||
|
||||
var fdis: FSDataInputStream =null
|
||||
var br: BufferedReader = null
|
||||
var sequences: RichSequenceIterator = null
|
||||
var doc: JSONObject = null
|
||||
var seq: RichSequence = null
|
||||
var jsonStr: String = ""
|
||||
var n:Int=0
|
||||
var count:Int=0
|
||||
inDf.collect().foreach(row => {
|
||||
pathStr = row.get(0).asInstanceOf[String]
|
||||
fdis = fs.open(new Path(pathStr))
|
||||
br = new BufferedReader(new InputStreamReader(fdis))
|
||||
sequences = CustomIOTools.IOTools.readUniProt(br,null)
|
||||
|
||||
while (sequences.hasNext) {
|
||||
count += 1
|
||||
if (count < 100) {
|
||||
doc = new JSONObject()
|
||||
seq = sequences.nextRichSequence()
|
||||
Process.processUniprotSeq(seq, doc)
|
||||
|
||||
n += 1
|
||||
doc.write(hdfsWriter)
|
||||
hdfsWriter.write("\n")
|
||||
|
||||
doc = new JSONObject()
|
||||
seq = sequences.nextRichSequence()
|
||||
Process.processUniprotSeq(seq,doc)
|
||||
jsonStr = doc.toString
|
||||
println("start " + n + "String\\\n" )
|
||||
|
||||
if (n == 1) {
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("[" + jsonStr).getBytes()))
|
||||
} else {
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("," + jsonStr).getBytes()))
|
||||
seq = null
|
||||
}
|
||||
}
|
||||
var count: Int = bis.read(buff)
|
||||
while (count != -1) {
|
||||
fdos.write(buff, 0, count)
|
||||
fdos.flush()
|
||||
count = bis.read(buff)
|
||||
}
|
||||
fdos.flush()
|
||||
br.close()
|
||||
fdis.close()
|
||||
|
||||
bis = null
|
||||
doc = null
|
||||
seq = null
|
||||
jsonStr = ""
|
||||
}
|
||||
sequences = null
|
||||
br = null
|
||||
fdis =null
|
||||
pathStr = null
|
||||
})
|
||||
bis = new BufferedInputStream(new ByteArrayInputStream(("]").getBytes()))
|
||||
|
||||
var count: Int = bis.read(buff)
|
||||
while (count != -1) {
|
||||
fdos.write(buff, 0, count)
|
||||
fdos.flush()
|
||||
count = bis.read(buff)
|
||||
}
|
||||
fdos.flush()
|
||||
bis.close()
|
||||
fdos.close()
|
||||
hdfsWriter.close()
|
||||
|
||||
val df: DataFrame = session.read.json(hdfsPathTemporary)
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@ class FlowBean {
var checkpoint : String = _
var checkpointParentProcessId : String = _
var runMode : String = _
var showData : String = _
var stops : List[StopBean] = List()
var paths : List[PathBean] = List()

@ -25,6 +26,7 @@ class FlowBean {
this.checkpoint = flowMap.getOrElse("checkpoint","").asInstanceOf[String]
this.checkpointParentProcessId = flowMap.getOrElse("checkpointParentProcessId", "").asInstanceOf[String]
this.runMode = flowMap.getOrElse("runMode","RUN").asInstanceOf[String]
this.showData = flowMap.getOrElse("showData","0").asInstanceOf[String]

//construct StopBean List
val stopsList = MapUtil.get(flowMap,"stops").asInstanceOf[List[Map[String, Any]]]

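For context, a flow definition map carrying the new checkpointParentProcessId field would look roughly like the following; the concrete values here are illustrative only, not taken from the repository:

    // Illustrative flow map as FlowBean.init would consume it.
    object FlowMapExample {
      val flowMap: Map[String, Any] = Map(
        "name"                      -> "example_flow",
        "checkpoint"                -> "Stop2",
        "checkpointParentProcessId" -> "process_0000-0000",   // hypothetical parent process id
        "runMode"                   -> "RUN",
        "showData"                  -> "0",
        "stops"                     -> List.empty[Map[String, Any]],
        "paths"                     -> List.empty[Map[String, Any]]
      )
    }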
@ -378,13 +378,12 @@ class JobOutputStreamImpl() extends JobOutputStream with Logging {

def getDataFrame(port: String) = mapDataFrame(port);

def showData() = {
def showData(count:Int) = {

mapDataFrame.foreach(en => {
val portName = if(en._1.equals("")) "default" else en._1
println(portName + " port: ")

en._2.apply().show(PropertyUtil.getPropertyValue("data.show").toInt)
en._2.apply().show(count)

})
}

@ -484,7 +483,10 @@ class ProcessImpl(flow: Flow, runnerContext: Context, runner: Runner, parentProc
runnerListener.onJobCompleted(pe.getContext());

//show data in log
outputs.showData()
val showDataCount = PropertyUtil.getPropertyValue("data.show").toInt
if(showDataCount > 0) {
outputs.showData(showDataCount)
}
}
catch {
case e: Throwable =>

@ -528,7 +530,10 @@ class ProcessImpl(flow: Flow, runnerContext: Context, runner: Runner, parentProc
}
}
//show data in log
outputs.showData()
val showDataCount = PropertyUtil.getPropertyValue("data.show").toInt
if(showDataCount > 0) {
outputs.showData(showDataCount)
}

//save data in debug mode
if(flow.getRunMode() == FlowRunMode.DEBUG) {

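A defensive variant of the caller shown above, assuming PropertyUtil.getPropertyValue returns a String (or null when data.show is unset); calling toInt directly would throw on a missing or non-numeric value, so this sketch falls back to 0 and keeps the preview disabled:

    // Guarded read of data.show before invoking the new showData(count).
    val showDataCount =
      Option(PropertyUtil.getPropertyValue("data.show"))
        .flatMap(v => scala.util.Try(v.trim.toInt).toOption)
        .getOrElse(0)
    if (showDataCount > 0) {
      outputs.showData(showDataCount)   // prints each output port and its first N rows
    }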
@ -10,6 +10,12 @@
<modelVersion>4.0.0</modelVersion>

<artifactId>piflow-server</artifactId>

<properties>
<akka.version>2.4.14</akka.version>
<akka.http.version>10.0.0</akka.http.version>
</properties>

<dependencies>
<dependency>
<groupId>piflow</groupId>

@ -30,40 +36,31 @@
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-remote_2.11</artifactId>
<version>2.3.14</version>
<version>${akka.version}</version>
</dependency>

<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-actor_2.11</artifactId>
<version>2.3.14</version>
<version>${akka.version}</version>
</dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-slf4j_2.11</artifactId>
<version>2.3.14</version>
<version>${akka.version}</version>
</dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-stream-experimental_2.11</artifactId>
<version>2.0.4</version>
<artifactId>akka-http_2.11</artifactId>
<version>${akka.http.version}</version>
</dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-http-core-experimental_2.11</artifactId>
<version>2.0.4</version>
</dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-http-experimental_2.11</artifactId>
<version>2.0.4</version>
</dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-http-spray-json-experimental_2.11</artifactId>
<version>2.0.4</version>
<artifactId>akka-http-spray-json_2.11</artifactId>
<version>${akka.http.version}</version>
</dependency>

</dependencies>

@ -0,0 +1,10 @@
PiFlowHTTPService{
shutdown-timeout:300s
}

akka {
http {
idle-timeout = 600 s
request-timeout = 200 s
}
}

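The new configuration file is standard HOCON, so the values can be read back through Typesafe Config; a small sketch using only the keys the added file defines (the Duration conversion is the regular Config API):

    // Reading the new timeouts from application.conf.
    import com.typesafe.config.ConfigFactory
    import java.util.concurrent.TimeUnit

    object ConfigSketch extends App {
      val config = ConfigFactory.load()
      val shutdownTimeoutSec = config.getDuration("PiFlowHTTPService.shutdown-timeout", TimeUnit.SECONDS)
      val requestTimeoutSec  = config.getDuration("akka.http.request-timeout", TimeUnit.SECONDS)
      println(s"shutdown-timeout=${shutdownTimeoutSec}s, request-timeout=${requestTimeoutSec}s")
    }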
@ -19,7 +19,8 @@ import spray.json.DefaultJsonProtocol

object HTTPService extends DefaultJsonProtocol with Directives with SprayJsonSupport{
implicit val system = ActorSystem("HTTPService", ConfigFactory.load())
implicit val config = ConfigFactory.load()
implicit val system = ActorSystem("PiFlowHTTPService", config)
implicit val materializer = ActorMaterializer()
implicit val executionContext = system.dispatcher
var processMap = Map[String, SparkAppHandle]()

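With the pom moved to akka-http 10.0.0, the implicit system and materializer declared above are exactly what a route binding needs; a minimal sketch of how a route could be served (assumed akka-http 10.0 API, and the port number here is purely illustrative, not the service's actual port):

    // Minimal akka-http bind sketch, relying on the implicits defined in HTTPService.
    import akka.http.scaladsl.Http
    import akka.http.scaladsl.server.Directives._

    val route = path("health") { get { complete("ok") } }
    val binding = Http().bindAndHandle(route, "0.0.0.0", 8001)   // hypothetical interface/port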