update join stop

This commit is contained in:
or 2019-09-02 10:50:55 +08:00
parent 7572fe96bc
commit c58a82a215
2 changed files with 16 additions and 24 deletions

View File

@ -9,7 +9,7 @@ import org.apache.spark.sql.{Column, DataFrame}
class Join extends ConfigurableStop{ class Join extends ConfigurableStop{
override val authorEmail: String = "yangqidong@cnic.cn" override val authorEmail: String = "yangqidong@cnic.cn"
override val description: String = "Table connection, including full connection, left connection, right connection and inner connection" override val description: String = "Table connection, including full connection, left connection, right connection and inner connection"
override val inportList: List[String] =List(PortEnum.AnyPort.toString) override val inportList: List[String] =List(PortEnum.LeftPort.toString,PortEnum.RightPort.toString)
override val outportList: List[String] = List(PortEnum.DefaultPort.toString) override val outportList: List[String] = List(PortEnum.DefaultPort.toString)
var joinMode:String=_ var joinMode:String=_
@ -19,31 +19,20 @@ class Join extends ConfigurableStop{
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val dfs: Seq[DataFrame] = in.ports().map(in.read(_)) val leftDF = in.read(PortEnum.LeftPort)
var df1: DataFrame = dfs(0) val rightDF = in.read(PortEnum.RightPort)
var df2: DataFrame = dfs(1)
var column: Column = null
val correlationFieldArr: Array[String] = correlationField.split(",") var seq: Seq[String]= Seq()
df1 = df1.withColumnRenamed(correlationFieldArr(0),correlationFieldArr(0)+"_1") correlationField.split(",").foreach(x=>{
df2 = df2.withColumnRenamed(correlationFieldArr(0),correlationFieldArr(0)+"_2") seq = seq .++(Seq(x.toString))
column = df1(correlationFieldArr(0)+"_1")===df2(correlationFieldArr(0)+"_2") })
if(correlationFieldArr.size > 1){
for(x <- (1 until correlationFieldArr.size)){
var newColumn: Column =null
df1 = df1.withColumnRenamed(correlationFieldArr(x),correlationFieldArr(x)+"_1")
df2 = df2.withColumnRenamed(correlationFieldArr(x),correlationFieldArr(x)+"_2")
newColumn = df1(correlationFieldArr(x)+"_1")===df2(correlationFieldArr(x)+"_2")
column = column and newColumn
}
}
var df: DataFrame = null var df: DataFrame = null
joinMode match { joinMode match {
case "INNER" =>df = df1.join(df2, column) case "inner" =>df = leftDF.join(rightDF, seq)
case "LEFT" => df = df1.join(df2,column,"left_outer") case "left" => df = leftDF.join(rightDF,seq,"left_outer")
case "RIGHT" => df = df1.join(df2,column,"right_outer") case "right" => df = leftDF.join(rightDF,seq,"right_outer")
case "FULL" => df = df1.join(df2,column,"outer") case "full_outer" => df = leftDF.join(rightDF,seq,"outer")
} }
out.write(df) out.write(df)
@ -59,8 +48,9 @@ class Join extends ConfigurableStop{
override def getPropertyDescriptor(): List[PropertyDescriptor] = { override def getPropertyDescriptor(): List[PropertyDescriptor] = {
var descriptor : List[PropertyDescriptor] = List() var descriptor : List[PropertyDescriptor] = List()
val joinMode = new PropertyDescriptor().name("joinMode").displayName("joinMode").description("For table association, you can choose INNER, LEFT, RIGHT, FULL").defaultValue("").required(true) val joinMode = new PropertyDescriptor().name("joinMode").displayName("joinMode").description("For table association, you can choose INNER, LEFT, RIGHT, FULL")
val correlationField = new PropertyDescriptor().name("correlationField").displayName("correlationField").description("Fields associated with tables,If there are more than one, please use, separate").defaultValue("").required(true) .allowableValues(Set("inner","left","right","full_outer")).defaultValue("inner").required(true)
val correlationField = new PropertyDescriptor().name("correlationField").displayName("correlationField").description("Fields associated with tables,If there are more than one, please use , separate").defaultValue("").required(true)
descriptor = correlationField :: descriptor descriptor = correlationField :: descriptor
descriptor = joinMode :: descriptor descriptor = joinMode :: descriptor

View File

@ -5,6 +5,8 @@ object PortEnum {
val AnyPort = "Any" val AnyPort = "Any"
val DefaultPort = "Default" val DefaultPort = "Default"
val NonePort = "None" val NonePort = "None"
val LeftPort = "Left"
val RightPort= "Right"
val scopus_articlePort = "scopus_articlePort" val scopus_articlePort = "scopus_articlePort"
val scopus_article_api_response = "scopus_article_api_response" val scopus_article_api_response = "scopus_article_api_response"
} }