Skip to content

Commit a1c85a4

Browse files
authored
Add xgboost evaluation local code (#2844)
* add xgboost evaluation local code * replace temp_file apis * polish
1 parent 6443307 commit a1c85a4

3 files changed

Lines changed: 229 additions & 20 deletions

File tree

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
# Copyright 2020 The SQLFlow Authors. All rights reserved.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
import os
15+
16+
import numpy as np
17+
import runtime.temp_file as temp_file
18+
import runtime.xgboost as xgboost_extended
19+
import sklearn.metrics
20+
import xgboost as xgb
21+
from runtime import db
22+
from runtime.feature.compile import compile_ir_feature_columns
23+
from runtime.feature.derivation import get_ordered_field_descs
24+
from runtime.feature.field_desc import DataType
25+
from runtime.local.xgboost_submitter.predict import _calc_predict_result
26+
from runtime.model.model import Model
27+
from runtime.xgboost.dataset import xgb_dataset
28+
29+
# Whitelist of metric names callers may request; each entry must be the name
# of a callable in sklearn.metrics (resolved via getattr at evaluation time).
SKLEARN_METRICS = [
    'accuracy_score',
    'average_precision_score',
    'balanced_accuracy_score',
    'brier_score_loss',
    'cohen_kappa_score',
    'explained_variance_score',
    'f1_score',
    'fbeta_score',
    'hamming_loss',
    'hinge_loss',
    'log_loss',
    'mean_absolute_error',
    'mean_squared_error',
    'mean_squared_log_error',
    'median_absolute_error',
    'precision_score',
    'r2_score',
    'recall_score',
    'roc_auc_score',
    'zero_one_loss',
]
51+
52+
53+
def evaluate(datasource,
             select,
             result_table,
             load,
             pred_label_name=None,
             validation_metrics=None):
    """
    Do evaluation to a trained XGBoost model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        load (str): where the trained model stores.
        pred_label_name (str): the label column name.
        validation_metrics (list[str]): the evaluation metric names.
            Defaults to ["accuracy_score"].

    Returns:
        None.
    """
    # Avoid a mutable default argument; the effective default is unchanged.
    if validation_metrics is None:
        validation_metrics = ["accuracy_score"]

    model = Model.load_from_db(datasource, load)
    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    if pred_label_name:
        train_label_desc.name = pred_label_name

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    # NOTE(review): "my_model" appears to be the fixed file name the loaded
    # model is dumped under in the current working directory — confirm
    # against Model.load_from_db.
    bst.load_model("my_model")
    conn = db.connect_with_data_source(datasource)

    try:
        result_column_names = _create_evaluate_table(conn, result_table,
                                                     validation_metrics)

        with temp_file.TemporaryDirectory() as tmp_dir_name:
            pred_fn = os.path.join(tmp_dir_name, "predict.txt")

            dpred = xgb_dataset(datasource=datasource,
                                fn=pred_fn,
                                dataset_sql=select,
                                feature_metas=feature_metas,
                                feature_column_names=feature_column_names,
                                label_meta=train_label_desc.to_dict(),
                                cache=True,
                                batch_size=10000,
                                transform_fn=transform_fn)

            for i, pred_dmatrix in enumerate(dpred):
                # xgb_dataset dumps batch i into the file "<pred_fn>_<i>".
                feature_file_name = pred_fn + "_%d" % i
                preds = _calc_predict_result(bst, pred_dmatrix, model_params)
                _store_evaluate_result(preds, feature_file_name,
                                       train_label_desc, result_table,
                                       result_column_names,
                                       validation_metrics, conn)
    finally:
        # Close the connection even if table creation or evaluation raises;
        # the original code leaked it on any exception.
        conn.close()
120+
121+
def _create_evaluate_table(conn, result_table, validation_metrics):
    """
    (Re)create the table that stores the evaluation result.

    Args:
        conn: the database connection object.
        result_table (str): the output data table.
        validation_metrics (list[str]): the evaluation metric names.

    Returns:
        The column names of the created table.
    """
    # The schema is one FLOAT column per result: "loss" first, then one
    # column for each requested metric.
    result_columns = ['loss'] + validation_metrics
    float_field_type = DataType.to_db_field_type(conn.driver, DataType.FLOAT32)
    column_defs = ",".join(
        "%s %s" % (name, float_field_type) for name in result_columns)

    conn.execute("DROP TABLE IF EXISTS %s;" % result_table)
    conn.execute("CREATE TABLE %s (%s);" % (result_table, column_defs))

    return result_columns
147+
148+
def _store_evaluate_result(preds, feature_file_name, label_desc, result_table,
                           result_column_names, validation_metrics, conn):
    """
    Save the evaluation result in the table.

    Args:
        preds: the prediction result.
        feature_file_name (str): the file path where the feature dumps.
        label_desc (FieldDesc): the label FieldDesc object.
        result_table (str): the result table name.
        result_column_names (list[str]): the result column names.
        validation_metrics (list[str]): the evaluation metric names.
        conn: the database connection object.

    Returns:
        None.

    Raises:
        ValueError: if a metric name is not listed in SKLEARN_METRICS.
        TypeError: if the label dtype is neither INT64 nor FLOAT32.
    """
    y_test = []
    with open(feature_file_name, 'r') as f:
        # Iterate the file lazily instead of materializing readlines();
        # also drop the pointless identity list comprehension.
        for line in f:
            row = line.strip().split("\t")
            # DMatrix store label in the first column
            if label_desc.dtype == DataType.INT64:
                y_test.append(int(row[0]))
            elif label_desc.dtype == DataType.FLOAT32:
                y_test.append(float(row[0]))
            else:
                raise TypeError("unsupported data type {}".format(
                    label_desc.dtype))

    y_test = np.array(y_test)

    # BUG FIX: normalize the metric names once. The original code stored
    # results under the stripped name but looked them up in the write loop
    # with the raw name, raising KeyError for whitespace-padded names.
    metric_names = [m.strip() for m in validation_metrics]

    evaluate_results = dict()
    for metric_name in metric_names:
        if metric_name not in SKLEARN_METRICS:
            raise ValueError("unsupported metrics %s" % metric_name)
        metric_func = getattr(sklearn.metrics, metric_name)
        evaluate_results[metric_name] = metric_func(y_test, preds)

    # write evaluation result to result table; the "loss" column is always
    # written as the placeholder value 0.0
    with db.buffered_db_writer(conn, result_table, result_column_names) as w:
        row = ["0.0"]
        for mn in metric_names:
            row.append(str(evaluate_results[mn]))
        w.write(row)

python/runtime/local/xgboost_submitter/local_submitter_test.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,14 @@
1111
# See the License for the specific language governing permissions and
1212
# limitations under the License.
1313

14-
import os
1514
import unittest
1615

1716
import runtime.db as db
1817
import runtime.temp_file as temp_file
1918
import runtime.testing as testing
2019
from runtime.feature.column import NumericColumn
2120
from runtime.feature.field_desc import FieldDesc
21+
from runtime.local.xgboost_submitter.evaluate import evaluate
2222
from runtime.local.xgboost_submitter.predict import pred
2323
from runtime.local.xgboost_submitter.train import train
2424

@@ -37,7 +37,7 @@ def get_table_schema(self, conn, table):
3737

3838
@unittest.skipUnless(testing.get_driver() == "mysql",
3939
"skip non mysql tests")
40-
def test_train_and_predict(self):
40+
def test_main(self):
4141
ds = testing.get_datasource()
4242
original_sql = """SELECT * FROM iris.train
4343
TO TRAIN xgboost.gbtree
@@ -56,7 +56,6 @@ def test_train_and_predict(self):
5656
save_name = "iris.xgboost_train_model_test"
5757
class_name = "class"
5858

59-
old_dir_name = os.getcwd()
6059
with temp_file.TemporaryDirectory(as_cwd=True):
6160
eval_result = train(original_sql=original_sql,
6261
model_image="sqlflow:step",
@@ -97,7 +96,12 @@ def test_train_and_predict(self):
9796
diff_schema = schema2.keys() - schema1.keys()
9897
self.assertEqual(len(diff_schema), 0)
9998

100-
os.chdir(old_dir_name)
99+
evaluate(ds, pred_select, "iris.evaluate_result_table", save_name,
100+
'class', ['accuracy_score'])
101+
eval_schema = self.get_table_schema(conn,
102+
"iris.evaluate_result_table")
103+
self.assertEqual(eval_schema.keys(),
104+
set(['loss', 'accuracy_score']))
101105

102106

103107
if __name__ == '__main__':

python/runtime/local/xgboost_submitter/predict.py

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -88,33 +88,25 @@ def pred(datasource, select, result_table, pred_label_name, model):
8888
for idx, pred_dmatrix in enumerate(dpred):
8989
feature_file_name = os.path.join(
9090
tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
91-
_predict_and_store_result(bst, pred_dmatrix, model_params,
92-
result_table, result_column_names,
93-
train_label_idx, feature_file_name, conn)
91+
preds = _calc_predict_result(bst, pred_dmatrix, model_params)
92+
_store_predict_result(preds, result_table, result_column_names,
93+
train_label_idx, feature_file_name, conn)
9494
print("Done predicting. Predict table : %s" % result_table)
9595

9696
conn.close()
9797

9898

99-
def _predict_and_store_result(bst, dpred, model_params, result_table,
100-
result_column_names, train_label_idx,
101-
feature_file_name, conn):
99+
def _calc_predict_result(bst, dpred, model_params):
102100
"""
103-
Do prediction and save the prediction result in the table.
101+
Calculate the prediction result.
104102
105103
Args:
106104
bst: the XGBoost booster object.
107105
dpred: the XGBoost DMatrix input data to predict.
108106
model_params (dict): the XGBoost model parameters.
109-
result_table (str): the result table name.
110-
result_column_names (list[str]): the result column names.
111-
train_label_idx (int): the index where the trained label is inside
112-
result_column_names.
113-
feature_file_name (str): the file path where the feature dumps.
114-
conn: the database connection object.
115107
116108
Returns:
117-
None.
109+
The prediction result.
118110
"""
119111
preds = bst.predict(dpred)
120112

@@ -128,8 +120,27 @@ def _predict_and_store_result(bst, dpred, model_params, result_table,
128120
elif objective.startswith("multi:") and len(preds) == 2:
129121
preds = np.argmax(np.array(preds), axis=1)
130122

131-
with db.buffered_db_writer(conn, result_table, result_column_names,
132-
100) as w:
123+
return preds
124+
125+
126+
def _store_predict_result(preds, result_table, result_column_names,
127+
train_label_idx, feature_file_name, conn):
128+
"""
129+
Save the prediction result in the table.
130+
131+
Args:
132+
preds: the prediction result to save.
133+
result_table (str): the result table name.
134+
result_column_names (list[str]): the result column names.
135+
train_label_idx (int): the index where the trained label is inside
136+
result_column_names.
137+
feature_file_name (str): the file path where the feature dumps.
138+
conn: the database connection object.
139+
140+
Returns:
141+
None.
142+
"""
143+
with db.buffered_db_writer(conn, result_table, result_column_names) as w:
133144
with open(feature_file_name, "r") as feature_file_read:
134145
line_no = 0
135146
for line in feature_file_read.readlines():

0 commit comments

Comments
 (0)