Increased functionality for the decompressing Stop
QiDong Yang

parent: a509a6d70a
commit: ceb2239755
@@ -88,6 +88,8 @@ class LoadFromFtpToHDFS extends ConfigurableStop {
     con = new FTPClientConfig(FTPClientConfig.SYST_NT)
     con.setServerLanguageCode("zh")
     ftp.setFileType(FTP.BINARY_FILE_TYPE)
+    ftp.setDataTimeout(600000)
+    ftp.setConnectTimeout(600000)
     ftp
   }
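Note: the two added calls give the Commons Net client a 10-minute (600000 ms) connect timeout and data-transfer timeout, so a dead FTP server or a stalled transfer fails fast instead of hanging the flow. A minimal standalone sketch of how these settings fit together (host, port, and credentials are placeholder values, not part of this commit):

    import org.apache.commons.net.ftp.{FTP, FTPClient, FTPClientConfig}

    object FtpTimeoutSketch {
      def main(args: Array[String]): Unit = {
        val ftp = new FTPClient()
        val conf = new FTPClientConfig(FTPClientConfig.SYST_NT)
        conf.setServerLanguageCode("zh")
        ftp.configure(conf)

        ftp.setConnectTimeout(600000) // must be set before connect()
        ftp.setDataTimeout(600000)    // guards reads on the data connection

        ftp.connect("ftp.example.com", 21) // placeholder host and port
        ftp.login("anonymous", "")         // placeholder credentials
        ftp.setFileType(FTP.BINARY_FILE_TYPE)
        ftp.enterLocalPassiveMode()
        // ... retrieve files here ...
        ftp.logout()
        ftp.disconnect()
      }
    }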
@@ -1,5 +1,6 @@
 package cn.piflow.bundle.http

+import java.io._
 import java.util.zip.GZIPInputStream

 import cn.piflow.conf._
@@ -7,10 +8,11 @@ import cn.piflow.conf.bean.PropertyDescriptor
 import cn.piflow.conf.util.{ImageUtil, MapUtil}
 import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
 import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
+import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.types.{StringType, StructField, StructType}
 import org.apache.spark.sql.{DataFrame, Row, SparkSession}
+import org.apache.tools.tar.{TarEntry, TarInputStream}

 import scala.collection.mutable.ArrayBuffer
@@ -21,55 +23,110 @@ class UnzipFilesOnHDFS extends ConfigurableStop {
   val outportList: List[String] = List(PortEnum.DefaultPort.toString)

   var isCustomize: String = _
+  var hdfsUrl: String = _
   var filePath: String = _
-  var fileType: String = _
-  var unzipPath: String = _
+  var savePath: String = _

   var session: SparkSession = null
+  var arr: ArrayBuffer[Row] = ArrayBuffer()

+  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+
+    session = pec.get[SparkSession]()
+
+    if (isCustomize.equals("true")) {
+
+      unzipFile(filePath, savePath)
+
+    } else if (isCustomize.equals("false")) {
+
+      val inDf: DataFrame = in.read()
+      inDf.collect().foreach(row => {
+        filePath = row.get(0).asInstanceOf[String]
+        unzipFile(filePath, savePath)
+      })
+
-  def unzipFile(hdfsFilePath: String, zipFileType: String, unzipHdfsPath: String): String = {
-    var zft: String = ""
-    if (zipFileType.length < 1) {
-      zft = hdfsFilePath.split("\\.").last
-    } else {
-      zft = zipFileType
     }

-    val configuration: Configuration = new Configuration()
-    val pathARR: Array[String] = hdfsFilePath.split("\\/")
-    var hdfsUrl: String = ""
-    for (x <- (0 until 3)) {
-      hdfsUrl += (pathARR(x) + "/")
-    }
-    configuration.set("fs.defaultFS", hdfsUrl)
-
-    var uhp: String = ""
-    if (unzipHdfsPath.length < 1) {
-      for (x <- (0 until pathARR.length - 1)) {
-        uhp += (pathARR(x) + "/")
+    val rdd: RDD[Row] = session.sparkContext.makeRDD(arr.toList)
+    val fields: Array[StructField] = Array(StructField("savePath", StringType, nullable = true))
+    val schema: StructType = StructType(fields)
+    val df: DataFrame = session.createDataFrame(rdd, schema)
+
+    println("##################################################################################################")
+    // println(df.count())
+    df.show(20)
+    println("##################################################################################################")
+
+    out.write(df)
+
+  }

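Note: perform now delegates the real work to unzipFile and afterwards publishes every path collected in arr as a one-column DataFrame named savePath. The emit pattern in isolation (a standalone sketch; the local SparkSession and sample paths are stand-ins, not values from this commit):

    import org.apache.spark.sql.{Row, SparkSession}
    import org.apache.spark.sql.types.{StringType, StructField, StructType}

    object SavePathFrameSketch {
      def main(args: Array[String]): Unit = {
        val session = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate()

        // Stand-ins for the paths accumulated in `arr` during unpacking.
        val rows = Seq("/b/a.txt", "/b/sub/c.txt").map(p => Row.fromSeq(Array[Any](p)))

        val schema = StructType(Array(StructField("savePath", StringType, nullable = true)))
        val df = session.createDataFrame(session.sparkContext.makeRDD(rows), schema)
        df.show(20)

        session.stop()
      }
    }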
+  def whatType(p: String): String = {
+    var typeStr: String = ""
+    val pathNames: Array[String] = p.split("\\.")
+    val lastStr: String = pathNames.last
+    if (lastStr.equals("gz")) {
+      val penultStr: String = pathNames(pathNames.length - 2)
+      if (penultStr.equals("tar")) {
+        typeStr = "tar.gz"
+      } else {
+        typeStr = "gz"
+      }
+    } else {
+      throw new RuntimeException("File type fill in error, or do not support this type.")
+    }
+    typeStr
+  }
+
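Note: whatType decides the archive format purely from the file name: a trailing .gz is tar.gz when the second-to-last extension segment is tar, plain gz otherwise, and every other suffix is rejected. The same logic as a self-contained check (a local copy for illustration, not the class method itself):

    object WhatTypeSketch {
      def whatType(p: String): String = {
        val parts = p.split("\\.")
        parts.last match {
          case "gz" if parts.length > 1 && parts(parts.length - 2) == "tar" => "tar.gz"
          case "gz" => "gz"
          case _    => throw new RuntimeException("File type fill in error, or do not support this type.")
        }
      }

      def main(args: Array[String]): Unit = {
        assert(whatType("/a/a.tar.gz") == "tar.gz")
        assert(whatType("/a/a.gz") == "gz")
      }
    }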
+  def getFs(fileHdfsPath: String): FileSystem = {
+    var configuration: Configuration = new Configuration()
+    var fs: FileSystem = null
+    if (isCustomize.equals("false")) {
+      val pathARR: Array[String] = fileHdfsPath.split("\\/")
+      hdfsUrl = ""
+      for (x <- (0 until 3)) {
+        hdfsUrl += (pathARR(x) + "/")
+      }
+    }
+    configuration.set("fs.defaultFS", hdfsUrl)
+    fs = FileSystem.get(configuration)
+    fs
+  }
+
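Note: when the path arrives from the upstream port (isCustomize equals "false"), getFs rebuilds fs.defaultFS from the first three "/"-separated segments of the full HDFS URI; otherwise it trusts the configured hdfsUrl property. How the split recombines, using the sample address from the property descriptions below (sketch):

    object FsUrlSketch {
      def main(args: Array[String]): Unit = {
        val path = "hdfs://10.0.86.89:9000/a/a.gz"
        // split("\\/") yields Array("hdfs:", "", "10.0.86.89:9000", "a", "a.gz");
        // the loop in getFs concatenates the first three segments plus "/".
        val fsUrl = path.split("\\/").take(3).mkString("/") + "/"
        assert(fsUrl == "hdfs://10.0.86.89:9000/")
      }
    }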
+  def unzipFile(fileHdfsPath: String, saveHdfsPath: String) = {
+    var eachSavePath: String = ""
+
+    var unType: String = whatType(fileHdfsPath)
+    var fileName: String = fileHdfsPath.split("\\/").last
+    var fs: FileSystem = getFs(fileHdfsPath)
+
+    var sp: String = ""
+    if (saveHdfsPath.length < 1) {
+      sp = fileHdfsPath.replace(fileName, "")
     } else {
-      uhp = unzipHdfsPath
+      sp = saveHdfsPath
     }

-    val fs = FileSystem.get(configuration)
-    val fdis: FSDataInputStream = fs.open(new Path(hdfsFilePath))
-    val filePathArr: Array[String] = hdfsFilePath.split("/")
-    var fileName: String = filePathArr.last
-    if (fileName.length == 0) {
-      fileName = filePathArr(filePathArr.size - 2)
-    }
-
-    var savePath: String = ""
+    val fdis: FSDataInputStream = fs.open(new Path(fileHdfsPath))

-    if (zft.equals("gz")) {
+    if (unType.equals("gz")) {
       val gzip: GZIPInputStream = new GZIPInputStream(fdis)
       var n = -1
       val buf = new Array[Byte](10 * 1024 * 1024)
-      savePath = uhp + fileName.replace(".gz", "")
-      val path = new Path(savePath)
+      eachSavePath = sp + fileName.replace(".gz", "")
+      arr += Row.fromSeq(Array(eachSavePath))
+      val path = new Path(eachSavePath)
       val fdos = fs.create(path)
       while ((n = gzip.read(buf)) != -1 && n != -1) {
         fdos.write(buf, 0, n)
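Note: in the gz branch, Scala assignment returns Unit, so the left-hand `!= -1` in `while ((n = gzip.read(buf)) != -1 && n != -1)` is always true; only the trailing `n != -1` terminates the loop. An equivalent, more idiomatic stream copy avoids the C-style assignment-in-condition entirely (sketch; copyStream is a name local to this example):

    import java.io.{InputStream, OutputStream}

    // Copies `in` to `out` with the same 10 MB buffer the diff uses.
    def copyStream(in: InputStream, out: OutputStream): Unit = {
      val buf = new Array[Byte](10 * 1024 * 1024)
      Iterator.continually(in.read(buf))
        .takeWhile(_ != -1)
        .foreach(n => out.write(buf, 0, n))
    }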
@@ -78,60 +135,45 @@ class UnzipFilesOnHDFS extends ConfigurableStop {
       fdos.close()
       gzip.close()
       fdis.close()
-    } else {
-      throw new RuntimeException("File type fill in error, or do not support this type.")
-    }
-
-    savePath
-  }
-
-  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
-
-    session = pec.get[SparkSession]()
-
-    var savePath: String = ""
-    var arr: ArrayBuffer[Row] = ArrayBuffer()
-
-    if (isCustomize.equals("true")) {
-      println("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
-      savePath = unzipFile(filePath, fileType, unzipPath)
-      println("savepath : " + savePath)
-      arr += Row.fromSeq(Array(savePath))
-    } else if (isCustomize.equals("false")) {
-      val inDf: DataFrame = in.read()
-      inDf.collect().foreach(row => {
-        filePath = row.get(0).asInstanceOf[String]
-        savePath = unzipFile(filePath, "", "")
-        arr += Row.fromSeq(Array(savePath))
-        savePath = ""
-      })
+    } else if (unType.equals("tar.gz")) {
+
+      try {
+        val gzip = new GZIPInputStream(new BufferedInputStream(fdis))
+        val tarIn = new TarInputStream(gzip, 1024 * 2)
+        fs.create(new Path(sp)).close()
+
+        var entry: TarEntry = null
+        while ((entry = tarIn.getNextEntry) != null && entry != null) {
+          if (entry.isDirectory()) {
+            val outPath = sp + "/" + entry.getName
+            fs.create(new Path(outPath)).close()
+          } else {
+            val outPath = sp + "/" + entry.getName
+            arr += Row.fromSeq(Array(outPath))
+            val fos: FSDataOutputStream = fs.create(new Path(outPath))
+            var lenth = 0
+            val buff = new Array[Byte](1024)
+            while ((lenth = tarIn.read(buff)) != -1 && (lenth != -1)) {
+              fos.write(buff, 0, lenth)
+            }
+            fos.close()
+          }
+        }
+      } catch {
+        case e: IOException =>
+          e.printStackTrace()
+      }

     }

-    val rdd: RDD[Row] = session.sparkContext.makeRDD(arr.toList)
-    val fields: Array[StructField] = Array(StructField("unzipPath", StringType, nullable = true))
-    val schema: StructType = StructType(fields)
-    val df: DataFrame = session.createDataFrame(rdd, schema)
-
-    println("##################################################################################################")
-    // println(df.count())
-    df.show(20)
-    println("##################################################################################################")
-
-    out.write(df)
   }

   def initialize(ctx: ProcessContext): Unit = {

   }
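Note: the new tar.gz branch walks the archive entry by entry with Ant's TarInputStream (the org.apache.tools.tar import above), recreating directories, streaming each regular file to the save path, and recording every extracted file path in arr. The same walk condensed to the local filesystem so it runs standalone (sketch; paths are placeholders):

    import java.io.{BufferedInputStream, File, FileInputStream, FileOutputStream}
    import java.util.zip.GZIPInputStream
    import org.apache.tools.tar.TarInputStream

    object TarGzSketch {
      def untarGz(archive: String, destDir: String): Unit = {
        val tarIn = new TarInputStream(
          new GZIPInputStream(new BufferedInputStream(new FileInputStream(archive))), 2 * 1024)
        try {
          Iterator.continually(tarIn.getNextEntry).takeWhile(_ != null).foreach { entry =>
            val target = new File(destDir, entry.getName)
            if (entry.isDirectory) target.mkdirs()
            else {
              target.getParentFile.mkdirs()
              val fos = new FileOutputStream(target)
              val buf = new Array[Byte](1024)
              Iterator.continually(tarIn.read(buf)).takeWhile(_ != -1)
                .foreach(n => fos.write(buf, 0, n))
              fos.close()
            }
          }
        } finally tarIn.close()
      }
    }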
@@ -139,25 +181,24 @@ class UnzipFilesOnHDFS extends ConfigurableStop {
   def setProperties(map: Map[String, Any]) = {
     isCustomize = MapUtil.get(map, key = "isCustomize").asInstanceOf[String]
     filePath = MapUtil.get(map, key = "filePath").asInstanceOf[String]
-    fileType = MapUtil.get(map, key = "fileType").asInstanceOf[String]
-    unzipPath = MapUtil.get(map, key = "unzipPath").asInstanceOf[String]
+    hdfsUrl = MapUtil.get(map, key = "hdfsUrl").asInstanceOf[String]
+    savePath = MapUtil.get(map, key = "savePath").asInstanceOf[String]
   }

   override def getPropertyDescriptor(): List[PropertyDescriptor] = {
     var descriptor: List[PropertyDescriptor] = List()

-    val filePath = new PropertyDescriptor().name("filePath").displayName("filePath").description("file path, such as hdfs://10.0.86.89:9000/a/a.gz").defaultValue("").required(false)
-    val fileType = new PropertyDescriptor().name("fileType").displayName("fileType").description("file type, such as gz").defaultValue("").required(false)
-    val unzipPath = new PropertyDescriptor().name("unzipPath").displayName("unzipPath").description("unzip path, such as hdfs://10.0.86.89:9000/b/").defaultValue("").required(true)
+    val filePath = new PropertyDescriptor().name("filePath").displayName("filePath").description("file path, such as /a/a.gz").defaultValue("").required(false)
+    val hdfsUrl = new PropertyDescriptor().name("hdfsUrl").displayName("hdfsUrl").description("the url of HDFS, such as hdfs://10.0.86.89:9000").defaultValue("").required(false)
+    val savePath = new PropertyDescriptor().name("savePath").displayName("savePath").description("unzip dir path, such as /b/").defaultValue("").required(true)
     val isCustomize = new PropertyDescriptor().name("isCustomize").displayName("isCustomize").description("Whether to customize the compressed file path, if true, " +
-      "you must specify the path where the compressed file is located and the saved path after decompression. " +
-      "If it is false, it will automatically find the file path data from the upstream port and " +
-      "save it to the original folder after decompression.")
+      "you must specify the path where the compressed file is located. " +
+      "If it is false, it will automatically find the file path data from the upstream port.")
       .defaultValue("").required(false)
     descriptor = isCustomize :: descriptor
     descriptor = filePath :: descriptor
-    descriptor = fileType :: descriptor
-    descriptor = unzipPath :: descriptor
+    descriptor = hdfsUrl :: descriptor
+    descriptor = savePath :: descriptor

     descriptor
   }
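Note: after this commit the stop is configured with hdfsUrl, filePath, and savePath instead of filePath, fileType, and unzipPath; the archive type is now inferred from the file name by whatType. A hypothetical property map wired through setProperties, using the sample values from the descriptions above:

    val stop = new UnzipFilesOnHDFS()
    stop.setProperties(Map(
      "isCustomize" -> "true",
      "hdfsUrl"     -> "hdfs://10.0.86.89:9000",
      "filePath"    -> "/a/a.gz",
      "savePath"    -> "/b/"
    ))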