forked from opensci/piflow
update join stop
parent 7572fe96bc
commit c58a82a215
@@ -9,7 +9,7 @@ import org.apache.spark.sql.{Column, DataFrame}
 class Join extends ConfigurableStop{
   override val authorEmail: String = "yangqidong@cnic.cn"
   override val description: String = "Table connection, including full connection, left connection, right connection and inner connection"
-  override val inportList: List[String] = List(PortEnum.AnyPort.toString)
+  override val inportList: List[String] = List(PortEnum.LeftPort.toString, PortEnum.RightPort.toString)
   override val outportList: List[String] = List(PortEnum.DefaultPort.toString)

   var joinMode: String = _
@@ -19,31 +19,20 @@ class Join extends ConfigurableStop{

   override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

-    val dfs: Seq[DataFrame] = in.ports().map(in.read(_))
-    var df1: DataFrame = dfs(0)
-    var df2: DataFrame = dfs(1)
-    var column: Column = null
+    val leftDF = in.read(PortEnum.LeftPort)
+    val rightDF = in.read(PortEnum.RightPort)

-    val correlationFieldArr: Array[String] = correlationField.split(",")
-    df1 = df1.withColumnRenamed(correlationFieldArr(0), correlationFieldArr(0) + "_1")
-    df2 = df2.withColumnRenamed(correlationFieldArr(0), correlationFieldArr(0) + "_2")
-    column = df1(correlationFieldArr(0) + "_1") === df2(correlationFieldArr(0) + "_2")
-    if (correlationFieldArr.size > 1) {
-      for (x <- (1 until correlationFieldArr.size)) {
-        var newColumn: Column = null
-        df1 = df1.withColumnRenamed(correlationFieldArr(x), correlationFieldArr(x) + "_1")
-        df2 = df2.withColumnRenamed(correlationFieldArr(x), correlationFieldArr(x) + "_2")
-        newColumn = df1(correlationFieldArr(x) + "_1") === df2(correlationFieldArr(x) + "_2")
-        column = column and newColumn
-      }
-    }
+    var seq: Seq[String] = Seq()
+    correlationField.split(",").foreach(x => {
+      seq = seq ++ Seq(x.toString)
+    })

     var df: DataFrame = null
     joinMode match {
-      case "INNER" => df = df1.join(df2, column)
-      case "LEFT" => df = df1.join(df2, column, "left_outer")
-      case "RIGHT" => df = df1.join(df2, column, "right_outer")
-      case "FULL" => df = df1.join(df2, column, "outer")
+      case "inner" => df = leftDF.join(rightDF, seq)
+      case "left" => df = leftDF.join(rightDF, seq, "left_outer")
+      case "right" => df = leftDF.join(rightDF, seq, "right_outer")
+      case "full_outer" => df = leftDF.join(rightDF, seq, "outer")
     }
     out.write(df)
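The rewritten perform() also changes the shape of the joined output: the old code built a Column condition over renamed "_1"/"_2" key columns, so both copies of each key survived in the result, while the new join(right, usingColumns, joinType) call is a USING-style equi-join that keeps a single copy of the key. A minimal standalone sketch of that difference, assuming a local SparkSession and toy column names (id, v1, v2) that are not part of this commit:

import org.apache.spark.sql.SparkSession

object JoinSemanticsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("join-sketch").getOrCreate()
    import spark.implicits._

    val left  = Seq((1, "a"), (2, "b")).toDF("id", "v1")
    val right = Seq((1, "x"), (3, "y")).toDF("id", "v2")

    // Old approach: rename the key on each side and join on a Column expression;
    // both id_1 and id_2 remain in the output.
    val byColumn = left.withColumnRenamed("id", "id_1")
      .join(right.withColumnRenamed("id", "id_2"), $"id_1" === $"id_2", "left_outer")
    byColumn.printSchema()  // id_1, v1, id_2, v2

    // New approach: join on a Seq of column names (USING semantics);
    // the key column appears exactly once.
    val byUsing = left.join(right, Seq("id"), "left_outer")
    byUsing.printSchema()   // id, v1, v2

    spark.stop()
  }
}

Downstream stops that previously referred to the renamed key columns would now refer to the single, un-suffixed key column instead.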
@@ -59,8 +48,9 @@ class Join extends ConfigurableStop{
   override def getPropertyDescriptor(): List[PropertyDescriptor] = {
     var descriptor: List[PropertyDescriptor] = List()

-    val joinMode = new PropertyDescriptor().name("joinMode").displayName("joinMode").description("For table association, you can choose INNER, LEFT, RIGHT, FULL").defaultValue("").required(true)
-    val correlationField = new PropertyDescriptor().name("correlationField").displayName("correlationField").description("Fields associated with tables,If there are more than one, please use, separate").defaultValue("").required(true)
+    val joinMode = new PropertyDescriptor().name("joinMode").displayName("joinMode").description("For table association, you can choose INNER, LEFT, RIGHT, FULL")
+      .allowableValues(Set("inner","left","right","full_outer")).defaultValue("inner").required(true)
+    val correlationField = new PropertyDescriptor().name("correlationField").displayName("correlationField").description("Fields associated with tables,If there are more than one, please use , separate").defaultValue("").required(true)
     descriptor = correlationField :: descriptor
     descriptor = joinMode :: descriptor

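The multi-key behavior that the removed if/for block handled is preserved by the simpler code path: every comma-separated entry in correlationField ends up in the Seq passed to join, so several fields form one composite equi-join key, and the "full_outer" mode is forwarded to Spark as "outer" (Spark treats the two spellings as the same join type). A small sketch under the same local-SparkSession assumption, with hypothetical data and field names id,name:

import org.apache.spark.sql.SparkSession

object MultiKeyJoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("multikey-sketch").getOrCreate()
    import spark.implicits._

    // Same construction as the updated perform(): comma-separated field list -> Seq[String].
    val correlationField = "id,name"
    var seq: Seq[String] = Seq()
    correlationField.split(",").foreach(x => { seq = seq ++ Seq(x.toString) })
    // seq == Seq("id", "name")

    val left  = Seq((1, "a", 10), (2, "b", 20)).toDF("id", "name", "score")
    val right = Seq((1, "a", "ok"), (2, "c", "no")).toDF("id", "name", "status")

    // Rows are matched on the composite key (id, name); with the "outer" join type
    // unmatched rows from either side are kept and padded with nulls.
    val df = left.join(right, seq, "outer")
    df.show()

    spark.stop()
  }
}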
@@ -5,6 +5,8 @@ object PortEnum {
   val AnyPort = "Any"
   val DefaultPort = "Default"
   val NonePort = "None"
+  val LeftPort = "Left"
+  val RightPort = "Right"
   val scopus_articlePort = "scopus_articlePort"
   val scopus_article_api_response = "scopus_article_api_response"
 }