add decision tree classification stops

parent 5efe1a1745
commit b878944846
src/main/resources/decisiontree.json (new file)
@@ -0,0 +1,42 @@
{
  "flow":{
    "name":"test",
    "uuid":"1234",
    "stops":[
      {
        "uuid":"0000",
        "name":"DecisionTreeTraining",
        "bundle":"cn.piflow.bundle.ml_classification.DecisionTreeTraining",
        "properties":{
          "training_data_path":"hdfs://10.0.86.89:9000/xx/watermellonDataset.txt",
          "model_save_path":"hdfs://10.0.86.89:9000/xx/naivebayes/dt.model",
          "maxBins":"20",
          "maxDepth":"10",
          "minInfoGain":"0.1",
          "minInstancesPerNode":"2",
          "impurity":"entropy"
        }
      },
      {
        "uuid":"1111",
        "name":"DecisionTreePrediction",
        "bundle":"cn.piflow.bundle.ml_classification.DecisionTreePrediction",
        "properties":{
          "test_data_path":"hdfs://10.0.86.89:9000/xx/watermellonDataset.txt",
          "model_path":"hdfs://10.0.86.89:9000/xx/naivebayes/dt.model"
        }
      }
    ],
    "paths":[
      {
        "from":"DecisionTreeTraining",
        "outport":"",
        "inport":"",
        "to":"DecisionTreePrediction"
      }
    ]
  }
}
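In this flow, "paths" wires the two stops by name: DecisionTreeTraining saves the model to model_save_path, and DecisionTreePrediction reads it back from model_path, with both ports left empty. As a quick sanity check the JSON can be parsed the same way FlowTest_XX does it; the following is a minimal sketch (the object name is illustrative), assuming the file sits at src/main/resources/decisiontree.json as the updated test expects.

// Minimal sketch: parse the flow JSON and list its stops, mirroring the
// JSON.parseFull usage in FlowTest_XX. Not part of this commit.
import scala.util.parsing.json.JSON

object FlowJsonCheck extends App {
  val jsonStr = scala.io.Source.fromFile("src/main/resources/decisiontree.json").mkString
  val map = JSON.parseFull(jsonStr).get.asInstanceOf[Map[String, Any]]
  val flow = map("flow").asInstanceOf[Map[String, Any]]
  val stops = flow("stops").asInstanceOf[List[Map[String, Any]]]
  // print each stop name with the bundle class it resolves to
  stops.foreach(s => println(s("name") + " -> " + s("bundle")))
}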
DecisionTreePrediction.scala (new file)
@@ -0,0 +1,58 @@
package cn.piflow.bundle.ml_classification

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.MapUtil
import cn.piflow.conf.{ConfigurableStop, StopGroupEnum}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.sql.SparkSession

class DecisionTreePrediction extends ConfigurableStop {
  val authorEmail: String = "xiaoxiao@cnic.cn"
  val description: String = "Make use of an existing DecisionTreeModel to predict."
  val inportCount: Int = 1
  val outportCount: Int = 0
  var test_data_path: String = _
  var model_path: String = _

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()

    // load data stored in libsvm format as a DataFrame
    val data = spark.read.format("libsvm").load(test_data_path)
    //data.show()

    // load the persisted model
    val model = DecisionTreeClassificationModel.load(model_path)

    val predictions = model.transform(data)
    predictions.show()
    out.write(predictions)
  }

  def initialize(ctx: ProcessContext): Unit = {

  }

  def setProperties(map: Map[String, Any]): Unit = {
    test_data_path = MapUtil.get(map, key = "test_data_path").asInstanceOf[String]
    model_path = MapUtil.get(map, key = "model_path").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val test_data_path = new PropertyDescriptor().name("test_data_path").displayName("TEST_DATA_PATH").defaultValue("").required(true)
    val model_path = new PropertyDescriptor().name("model_path").displayName("MODEL_PATH").defaultValue("").required(true)
    descriptor = test_data_path :: descriptor
    descriptor = model_path :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = ???

  override def getGroup(): List[String] = {
    List(StopGroupEnum.MLGroup.toString)
  }
}
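For reference, the load-and-score round trip this stop wraps can be exercised directly against Spark MLlib, outside a PiFlow job. A minimal sketch, reusing the HDFS paths from the flow JSON above; the object name and the local master are assumptions for a standalone run, not part of the commit.

// Standalone sketch of the pattern DecisionTreePrediction wraps:
// read libsvm test data, load a persisted model, and score the data.
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.sql.SparkSession

object PredictSketch extends App {
  val spark = SparkSession.builder().appName("dt-predict").master("local[*]").getOrCreate()

  val data = spark.read.format("libsvm").load("hdfs://10.0.86.89:9000/xx/watermellonDataset.txt")
  val model = DecisionTreeClassificationModel.load("hdfs://10.0.86.89:9000/xx/naivebayes/dt.model")

  // transform() appends rawPrediction, probability and prediction columns
  model.transform(data).select("label", "prediction").show()

  spark.stop()
}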
DecisionTreeTraining.scala (new file)
@@ -0,0 +1,119 @@
package cn.piflow.bundle.ml_classification

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.MapUtil
import cn.piflow.conf.{ConfigurableStop, StopGroupEnum}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.sql.SparkSession

class DecisionTreeTraining extends ConfigurableStop {
  val authorEmail: String = "xiaoxiao@cnic.cn"
  val description: String = "Training a DecisionTreeModel."
  val inportCount: Int = 1
  val outportCount: Int = 0
  var training_data_path: String = _
  var model_save_path: String = _
  var maxBins: String = _
  var maxDepth: String = _
  var minInfoGain: String = _
  var minInstancesPerNode: String = _
  var impurity: String = _

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()

    // load data stored in libsvm format as a DataFrame
    val data = spark.read.format("libsvm").load(training_data_path)

    // Maximum number of bins used for discretizing continuous features and for
    // choosing how to split on features at each node. More bins give higher
    // granularity. Must be >= 2 and >= the number of categories in any
    // categorical feature.
    var maxBinsValue: Int = 40
    if (maxBins != "") {
      maxBinsValue = maxBins.toInt
    }

    // Maximum depth of the tree (>= 0); Spark caps the depth at 30.
    var maxDepthValue: Int = 30
    if (maxDepth != "") {
      maxDepthValue = maxDepth.toInt
    }

    // Minimum information gain for a split to be considered at a tree node.
    var minInfoGainValue: Double = 0.2
    if (minInfoGain != "") {
      minInfoGainValue = minInfoGain.toDouble
    }

    // Minimum number of instances each child must have after a split.
    var minInstancesPerNodeValue: Int = 3
    if (minInstancesPerNode != "") {
      minInstancesPerNodeValue = minInstancesPerNode.toInt
    }

    // Criterion used for information gain calculation: "gini" or "entropy".
    var impurityValue = "gini"
    if (impurity != "") {
      impurityValue = impurity
    }

    // train a decision tree classification model
    val model = new DecisionTreeClassifier()
      .setMaxBins(maxBinsValue)
      .setMaxDepth(maxDepthValue)
      .setMinInfoGain(minInfoGainValue)
      .setMinInstancesPerNode(minInstancesPerNodeValue)
      .setImpurity(impurityValue)
      .fit(data)

    // model persistence
    model.save(model_save_path)

    import spark.implicits._
    val dfOut = Seq(model_save_path).toDF
    dfOut.show()
    out.write(dfOut)
  }

  def initialize(ctx: ProcessContext): Unit = {

  }

  def setProperties(map: Map[String, Any]): Unit = {
    training_data_path = MapUtil.get(map, key = "training_data_path").asInstanceOf[String]
    model_save_path = MapUtil.get(map, key = "model_save_path").asInstanceOf[String]
    maxBins = MapUtil.get(map, key = "maxBins").asInstanceOf[String]
    maxDepth = MapUtil.get(map, key = "maxDepth").asInstanceOf[String]
    minInfoGain = MapUtil.get(map, key = "minInfoGain").asInstanceOf[String]
    minInstancesPerNode = MapUtil.get(map, key = "minInstancesPerNode").asInstanceOf[String]
    impurity = MapUtil.get(map, key = "impurity").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    val training_data_path = new PropertyDescriptor().name("training_data_path").displayName("TRAINING_DATA_PATH").defaultValue("").required(true)
    val model_save_path = new PropertyDescriptor().name("model_save_path").displayName("MODEL_SAVE_PATH").description("Path to save the trained model").defaultValue("").required(true)
    val maxBins = new PropertyDescriptor().name("maxBins").displayName("MAX_BINS").description("Maximum number of bins used for discretizing continuous features; must be >= 2").defaultValue("").required(true)
    val maxDepth = new PropertyDescriptor().name("maxDepth").displayName("MAX_DEPTH").description("Maximum depth of the tree (>= 0, at most 30)").defaultValue("").required(true)
    val minInfoGain = new PropertyDescriptor().name("minInfoGain").displayName("MIN_INFO_GAIN").description("Minimum information gain for a split to be considered at a tree node").defaultValue("").required(true)
    val minInstancesPerNode = new PropertyDescriptor().name("minInstancesPerNode").displayName("MIN_INSTANCES_PER_NODE").description("Minimum number of instances each child must have after a split").defaultValue("").required(true)
    val impurity = new PropertyDescriptor().name("impurity").displayName("IMPURITY").description("Criterion used for information gain calculation (case-insensitive). Supported: \"entropy\" and \"gini\". (default = gini)").defaultValue("").required(true)
    descriptor = training_data_path :: descriptor
    descriptor = model_save_path :: descriptor
    descriptor = maxBins :: descriptor
    descriptor = maxDepth :: descriptor
    descriptor = minInfoGain :: descriptor
    descriptor = minInstancesPerNode :: descriptor
    descriptor = impurity :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = ???

  override def getGroup(): List[String] = {
    List(StopGroupEnum.MLGroup.toString)
  }
}
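Each numeric property in perform() arrives as a string and falls back to a hard-coded default when the property is the empty string; note that a missing key would surface as null and throw on the != "" comparison. The same three-line pattern repeats five times, so it could be factored into a helper. A minimal sketch, assuming empty-or-null means "use the default"; ParamUtil is hypothetical and not part of this commit.

// Hypothetical helper factoring out the "empty string means default" parsing
// repeated in DecisionTreeTraining.perform(); also guards against null.
object ParamUtil {
  def intOrDefault(raw: String, default: Int): Int =
    if (raw == null || raw.isEmpty) default else raw.toInt

  def doubleOrDefault(raw: String, default: Double): Double =
    if (raw == null || raw.isEmpty) default else raw.toDouble
}

// usage inside perform():
//   val maxBinsValue = ParamUtil.intOrDefault(maxBins, 40)
//   val minInfoGainValue = ParamUtil.doubleOrDefault(minInfoGain, 0.2)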
FlowTest_XX.scala (modified)
@@ -14,7 +14,7 @@ class FlowTest_XX {
   def testFlow(): Unit ={

     //parse flow json
-    val file = "src/main/resources/logistic.json"
+    val file = "src/main/resources/decisiontree.json"
     val flowJsonStr = FileUtil.fileReader(file)
     val map = OptionUtil.getAny(JSON.parseFull(flowJsonStr)).asInstanceOf[Map[String, Any]]
     println(map)
@@ -30,7 +30,7 @@ class FlowTest_XX {
       .config("spark.driver.memory", "1g")
       .config("spark.executor.memory", "2g")
       .config("spark.cores.max", "2")
-      .config("spark.jars","/root/xx/piflow/out/artifacts/piflow_jar/piflow_jar.jar")
+      .config("spark.jars","/home/xx/piflow/out/artifacts/piflow_jar/piflow_jar.jar")
       .enableHiveSupport()
       .getOrCreate()

@@ -49,7 +49,7 @@ class FlowTest_XX {
   def testFlow2json() = {

     //parse flow json
-    val file = "src/main/resources/logistic.json"
+    val file = "src/main/resources/decisiontree.json"
     val flowJsonStr = FileUtil.fileReader(file)
     val map = OptionUtil.getAny(JSON.parseFull(flowJsonStr)).asInstanceOf[Map[String, Any]]

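One caveat on the flow as configured: training and prediction both read watermellonDataset.txt, so the predictions shown are on data the model has already seen. A held-out split gives a more honest accuracy figure; below is a minimal standalone sketch with Spark's built-in evaluator, using the parameter values from the flow JSON (the object name and local master are illustrative, not part of the commit).

// Sketch: train on 70% of the data and evaluate on the held-out 30%,
// instead of reusing the training file for prediction.
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql.SparkSession

object HoldOutSketch extends App {
  val spark = SparkSession.builder().appName("dt-holdout").master("local[*]").getOrCreate()
  val data = spark.read.format("libsvm").load("hdfs://10.0.86.89:9000/xx/watermellonDataset.txt")

  // fixed seed for a reproducible split
  val Array(train, test) = data.randomSplit(Array(0.7, 0.3), seed = 1234L)

  val model = new DecisionTreeClassifier()
    .setMaxBins(20)
    .setMaxDepth(10)
    .setMinInfoGain(0.1)
    .setMinInstancesPerNode(2)
    .setImpurity("entropy")
    .fit(train)

  val accuracy = new MulticlassClassificationEvaluator()
    .setMetricName("accuracy")
    .evaluate(model.transform(test))
  println(s"held-out accuracy = $accuracy")

  spark.stop()
}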