Add XGBoost evaluation codegen (#2867)

sneaxiy · web-flow · commit 6cd7b24e6993 · 2020-08-26T23:14:41.000+08:00
* add evaluate codegen

* update

* update __init__.py fix flake8

* polish py db code
diff --git a/go/cmd/sqlflowserver/e2e_workflow_test.go b/go/cmd/sqlflowserver/e2e_workflow_test.go
@@ -409,6 +409,15 @@ TO PREDICT iris.test_result_table.class
 USING sqlflow_models.xgb_classification;
 
 SELECT * FROM iris.test_result_table;
+
+SELECT * FROM iris.test
+TO EVALUATE sqlflow_models.xgb_classification
+WITH
+	validation.metrics="accuracy_score"
+LABEL class
+INTO iris.evaluate_result_table;
+
+SELECT * FROM iris.evaluate_result_table;
 `
 	testMain(extraTrainSQLProgram + sqlProgram)
 	testMain(sqlProgram)
diff --git a/go/codegen/experimental/codegen_couler.go b/go/codegen/experimental/codegen_couler.go
@@ -52,7 +52,7 @@ func GenerateCodeCouler(sqlProgram string, session *pb.Session) (string, error)
 	if err != nil {
 		return "", err
 	}
-	stepList := make([]*stepContext, 0)
+	var stepList []*stepContext
 	for idx, stmt := range stmts {
 		stepCode, image, err := generateStepCodeAndImage(stmt, idx, session, stmts)
 		if err != nil {
diff --git a/go/codegen/experimental/codegen_step.go b/go/codegen/experimental/codegen_step.go
@@ -34,6 +34,8 @@ func generateStepCodeAndImage(sqlStmt ir.SQLFlowStmt, stepIndex int, session *pb
 		return generateTrainCodeAndImage(stmt, stepIndex, session)
 	case *ir.PredictStmt:
 		return generatePredictCodeAndImage(stmt, stepIndex, session, sqlStmts)
+	case *ir.EvaluateStmt:
+		return generateEvaluationCodeAndImage(stmt, stepIndex, session, sqlStmts)
 	case *ir.NormalStmt:
 		code, err := generateNormalStmtStep(string(*stmt), stepIndex, session)
 		return code, "", err
@@ -55,9 +57,9 @@ func generateTrainCodeAndImage(trainStmt *ir.TrainStmt, stepIndex int, session *
 }
 
 func generatePredictCodeAndImage(predStmt *ir.PredictStmt, stepIndex int, session *pb.Session, sqlStmts []ir.SQLFlowStmt) (string, string, error) {
-	trainStmt := findModelGenerationTrainStmt(predStmt.Using, stepIndex, sqlStmts)
 	image := ""
 	isXGBoost := false
+	trainStmt := findModelGenerationTrainStmt(predStmt.Using, stepIndex, sqlStmts)
 	if trainStmt != nil {
 		image = trainStmt.ModelImage
 		isXGBoost = isXGBoostEstimator(trainStmt.Estimator)
@@ -80,6 +82,32 @@ func generatePredictCodeAndImage(predStmt *ir.PredictStmt, stepIndex int, sessio
 	return "", "", fmt.Errorf("not implemented model type")
 }
 
+func generateEvaluationCodeAndImage(evalStmt *ir.EvaluateStmt, stepIndex int, session *pb.Session, sqlStmts []ir.SQLFlowStmt) (string, string, error) {
+	image := ""
+	isXGBoost := false
+	trainStmt := findModelGenerationTrainStmt(evalStmt.ModelName, stepIndex, sqlStmts)
+	if trainStmt != nil {
+		image = trainStmt.ModelImage
+		isXGBoost = isXGBoostEstimator(trainStmt.Estimator)
+	} else {
+		meta, err := getModelMetadata(session, evalStmt.ModelName)
+		if err != nil {
+			return "", "", err
+		}
+		image = meta.imageName()
+		isXGBoost = meta.isXGBoostModel()
+	}
+
+	if isXGBoost {
+		code, err := XGBoostGenerateEvaluation(evalStmt, stepIndex, session)
+		if err != nil {
+			return "", "", err
+		}
+		return code, image, nil
+	}
+	return "", "", fmt.Errorf("not implemented model type")
+}
+
 // findModelGenerationTrainStmt finds the *ir.TrainStmt that generates the model named `modelName`.
 // TODO(sneaxiy): find a better way to do this when we have a well designed dependency analysis.
 func findModelGenerationTrainStmt(modelName string, idx int, sqlStmts []ir.SQLFlowStmt) *ir.TrainStmt {
@@ -144,7 +172,7 @@ func getModelMetadataFromDB(dbConnStr, table string) (*metadata, error) {
 	if err != nil {
 		return nil, err
 	}
-	if readCnt != int(length) {
+	if uint64(readCnt) != length {
 		return nil, fmt.Errorf("invalid model metadata")
 	}
 	json, err := simplejson.NewJson(jsonBytes)
diff --git a/go/codegen/experimental/xgboost.go b/go/codegen/experimental/xgboost.go
@@ -208,6 +208,77 @@ def step_entry_{{.StepIndex}}():
              load='''{{.Load}}''')
 `
 
+// xgbEvaluateFiller carries the values that XGBoostGenerateEvaluation
+// substitutes into xgbEvaluateTemplate.
+type xgbEvaluateFiller struct {
+	// StepIndex names the generated step_entry_<StepIndex> function.
+	StepIndex int
+
+	// String arguments of the generated Python evaluate() call.
+	DataSource, Select, ResultTable, PredLabelName, Load string
+
+	// ValidationMetrics is inserted into the template without quoting.
+	ValidationMetrics string
+
+	// Submitter selects the runtime.<Submitter> package evaluate is imported from.
+	Submitter string
+}
+
+// XGBoostGenerateEvaluation generates the XGBoost evaluation step code by
+// rendering xgbEvaluateTemplate for the given evaluate statement.
+//
+// The statement label must be an *ir.NumericColumn; any other label type is
+// rejected. The "validation.metrics" attribute, when present, must be a string
+// holding comma-separated metric names; when it is absent the metric list is
+// "accuracy_score".
+func XGBoostGenerateEvaluation(evalStmt *ir.EvaluateStmt, stepIndex int, session *pb.Session) (string, error) {
+	connStr, err := GeneratePyDbConnStr(session)
+	if err != nil {
+		return "", err
+	}
+
+	labelColumn, isNumeric := evalStmt.Label.(*ir.NumericColumn)
+	if !isNumeric {
+		return "", fmt.Errorf("unsupported label type %T", evalStmt.Label)
+	}
+	labelName := labelColumn.FieldDesc.Name
+
+	metrics := []string{"accuracy_score"}
+	if attr, found := evalStmt.Attributes["validation.metrics"]; found {
+		raw, isString := attr.(string)
+		if !isString {
+			return "", fmt.Errorf("validation.metrics must be of type string")
+		}
+		// Replace the default with the user-supplied names, trimming the
+		// whitespace around each comma-separated entry.
+		names := strings.Split(raw, ",")
+		metrics = make([]string, 0, len(names))
+		for _, name := range names {
+			metrics = append(metrics, strings.TrimSpace(name))
+		}
+	}
+
+	values := &xgbEvaluateFiller{
+		StepIndex:         stepIndex,
+		DataSource:        connStr,
+		Select:            replaceNewLineRuneAndTrimSpace(evalStmt.Select),
+		ResultTable:       evalStmt.Into,
+		PredLabelName:     labelName,
+		Load:              evalStmt.ModelName,
+		ValidationMetrics: ir.AttrToPythonValue(metrics),
+		Submitter:         getSubmitter(session),
+	}
+
+	var code bytes.Buffer
+	evalTemplate := template.Must(template.New("Evaluate").Parse(xgbEvaluateTemplate))
+	if err := evalTemplate.Execute(&code, values); err != nil {
+		return "", err
+	}
+	return code.String(), nil
+}
+
+// xgbEvaluateTemplate is the Python source template of an XGBoost evaluation
+// step, rendered by XGBoostGenerateEvaluation with an xgbEvaluateFiller. The
+// generated step_entry_<StepIndex> function imports evaluate from
+// runtime.<Submitter> and calls it inside a temp_file.TemporaryDirectory.
+const xgbEvaluateTemplate = `
+def step_entry_{{.StepIndex}}():
+    import runtime.temp_file as temp_file
+    from runtime.{{.Submitter}} import evaluate
+    
+    with temp_file.TemporaryDirectory(as_cwd=True):
+        evaluate(datasource='''{{.DataSource}}''', 
+                 select='''{{.Select}}''', 
+                 result_table='''{{.ResultTable}}''', 
+                 pred_label_name='''{{.PredLabelName}}''', 
+                 load='''{{.Load}}''',
+                 validation_metrics={{.ValidationMetrics}})
+`
+
 func getSubmitter(session *pb.Session) string {
 	if session.Submitter != "" {
 		return session.Submitter
diff --git a/python/runtime/local/__init__.py b/python/runtime/local/__init__.py
@@ -11,5 +11,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from runtime.local.submitter import submit_local_evaluate as evaluate  # noqa: F401, E501
 from runtime.local.submitter import submit_local_pred as pred  # noqa: F401
 from runtime.local.submitter import submit_local_train as train  # noqa: F401
diff --git a/python/runtime/local/submitter.py b/python/runtime/local/submitter.py
@@ -11,6 +11,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from runtime.local.xgboost_submitter.evaluate import \
+    evaluate as xgboost_evaluate
 from runtime.local.xgboost_submitter.predict import pred as xgboost_pred
 from runtime.local.xgboost_submitter.train import train as xgboost_train
 from runtime.model.model import EstimatorType, Model
@@ -74,5 +76,16 @@ def submit_local_pred(datasource, select, result_table, pred_label_name, load):
     if model.get_type() == EstimatorType.XGBOOST:
         xgboost_pred(datasource, select, result_table, pred_label_name, model)
     else:
-        raise NotImplementedError("not implemented model type: %s" %
-                                  model.get_type())
+        raise NotImplementedError("not implemented model type: {}".format(
+            model.get_type()))
+
+
+def submit_local_evaluate(datasource, select, result_table, pred_label_name,
+                          load, validation_metrics):
+    model = Model.load_from_db(datasource, load)
+    if model.get_type() == EstimatorType.XGBOOST:
+        xgboost_evaluate(datasource, select, result_table, model,
+                         pred_label_name, validation_metrics)
+    else:
+        raise NotImplementedError("not implemented model type: {}".format(
+            model.get_type()))
diff --git a/python/runtime/local/xgboost_submitter/evaluate.py b/python/runtime/local/xgboost_submitter/evaluate.py
@@ -16,6 +16,7 @@
 import numpy as np
 import runtime.temp_file as temp_file
 import runtime.xgboost as xgboost_extended
+import six
 import sklearn.metrics
 import xgboost as xgb
 from runtime import db
@@ -53,7 +54,7 @@
 def evaluate(datasource,
              select,
              result_table,
-             load,
+             model,
              pred_label_name=None,
              validation_metrics=["accuracy_score"]):
     """
@@ -63,14 +64,19 @@ def evaluate(datasource,
         datasource (str): the database connection string.
         select (str): the input data to predict.
         result_table (str): the output data table.
-        load (str): where the trained model stores.
+        model (Model|str): a Model object, or the name of the table to
+            load the model from.
         pred_label_name (str): the label column name.
         validation_metrics (list[str]): the evaluation metric names.
 
     Returns:
         None.
     """
-    model = Model.load_from_db(datasource, load)
+    if isinstance(model, six.string_types):
+        model = Model.load_from_db(datasource, model)
+    else:
+        assert isinstance(model,
+                          Model), "not supported model type %s" % type(model)
+
     model_params = model.get_meta("attributes")
     train_fc_map = model.get_meta("features")
     train_label_desc = model.get_meta("label").get_field_desc()[0]
diff --git a/python/runtime/model/db.py b/python/runtime/model/db.py
@@ -128,9 +128,9 @@ def _read_metadata(reader):
     return json.loads(metadata_json, cls=JSONDecoderWithFeatureColumn)
 
 
-def write_with_generator(datasource, table, gen, metadata):
+def write_with_generator_and_metadata(datasource, table, gen, metadata):
     """Write data into a table, the written data
-    comes from the input generator.
+    comes from the input generator and metadata.
 
     Args:
         datasource: string
@@ -176,9 +176,9 @@ def read_metadata_from_db(datasource, table):
     return metadata
 
 
-def read_with_generator(datasource, table, buff_size=256):
+def read_with_generator_and_metadata(datasource, table, buff_size=256):
     """Read data from a table, this function returns
-    a generator to yield the data.
+    a generator to yield the data, and the metadata dict.
 
     Args:
         datasource: string
@@ -188,20 +188,23 @@ def read_with_generator(datasource, table, buff_size=256):
         buff_size: int
             The buffer size to read data.
 
-    Returns: Generator
-        the generator yield row data of the table.
+    Returns: tuple(Generator, dict)
+        a generator function that yields the data read from the table
+        in buffers, and the model metadata dict.
     """
+    conn = connect_with_data_source(datasource)
+    r = SQLFSReader(conn, table)
+    metadata = _read_metadata(r)
+
     def reader():
-        conn = connect_with_data_source(datasource)
-        with SQLFSReader(conn, table) as r:
-            _read_metadata(r)
-            while True:
-                buffer = r.read(buff_size)
-                if not buffer:
-                    break
+        while True:
+            buffer = r.read(buff_size)
+            if not buffer:
+                break
 
-                yield buffer
+            yield buffer
 
+        r.close()
         conn.close()
 
-    return reader
+    return reader, metadata
diff --git a/python/runtime/model/model.py b/python/runtime/model/model.py
@@ -19,8 +19,8 @@
 from runtime.feature.column import (JSONDecoderWithFeatureColumn,
                                     JSONEncoderWithFeatureColumn)
 from runtime.model import oss
-from runtime.model.db import (read_metadata_from_db, read_with_generator,
-                              write_with_generator)
+from runtime.model.db import (read_with_generator_and_metadata,
+                              write_with_generator_and_metadata)
 from runtime.model.tar import unzip_dir, zip_dir
 
 # archive the current work director into a tarball
@@ -177,8 +177,9 @@ def _gen():
 
                 return _gen
 
-            write_with_generator(datasource, table, _bytes_reader(tarball),
-                                 self._to_dict())
+            write_with_generator_and_metadata(datasource, table,
+                                              _bytes_reader(tarball),
+                                              self._to_dict())
 
     @staticmethod
     def load_from_db(datasource, table, local_dir=None):
@@ -199,14 +200,14 @@ def load_from_db(datasource, table, local_dir=None):
 
         with temp_file.TemporaryDirectory() as tmp_dir:
             tarball = os.path.join(tmp_dir, TARBALL_NAME)
-            gen = read_with_generator(datasource, table)
+            gen, metadata = read_with_generator_and_metadata(datasource, table)
             with open(tarball, "wb") as f:
                 for data in gen():
                     f.write(bytes(data))
 
             Model._unzip(local_dir, tarball, load_from_db=True)
 
-        return Model._from_dict(read_metadata_from_db(datasource, table))
+        return Model._from_dict(metadata)
 
     def save_to_oss(self, oss_model_dir, local_dir=None):
         """

Original file line number	Diff line number	Diff line change
`@@ -52,7 +52,7 @@ func GenerateCodeCouler(sqlProgram string, session *pb.Session) (string, error)`
`52`	`52`	`if err != nil {`
`53`	`53`	`return "", err`
`54`	`54`	`}`
`55`		`- stepList := make([]*stepContext, 0)`
	`55`	`+ var stepList []*stepContext`
`56`	`56`	`for idx, stmt := range stmts {`
`57`	`57`	`stepCode, image, err := generateStepCodeAndImage(stmt, idx, session, stmts)`
`58`	`58`	`if err != nil {`