Skip to content

Commit 277ab0f

Browse files
authored
Add xgboost predict method (#2835)
* add xgboost predict * fix ut
1 parent 1b0233f commit 277ab0f

7 files changed

Lines changed: 355 additions & 58 deletions

File tree

python/runtime/db.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,27 @@ def read_features_from_row(row, select_cols, feature_column_names,
149149
return tuple(features)
150150

151151

152+
def to_db_field_type(driver, dtype):
    """
    Convert a column type name into a field type string that the
    CREATE TABLE statement of the target DBMS accepts.

    Args:
        driver (str): the DBMS driver type.
        dtype (str): the data type name.

    Returns:
        A field type string usable inside a CREATE TABLE statement.
    """
    if dtype not in ("VARCHAR", "CHAR"):
        return dtype
    # Character types need an explicit length on MySQL; other engines
    # (e.g. Hive) use STRING instead.
    return dtype + "(255)" if driver == "mysql" else "STRING"
171+
172+
152173
def db_generator(conn, statement, label_meta=None):
153174
def reader():
154175
rs = conn.query(statement)

python/runtime/feature/field_desc.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,32 @@ class DataType(object):
2727
FLOAT32 = 1
2828
STRING = 2
2929

30+
@staticmethod
def to_db_field_type(driver, dtype):
    """
    Map a DataType value to the field type string accepted by the
    CREATE TABLE statement of the target DBMS.

    Args:
        driver (str): the DBMS driver type.
        dtype (enum): the data type. One of FLOAT32, INT64 and STRING.

    Returns:
        A field type string that the CREATE TABLE statement accepts.

    Raises:
        ValueError: if dtype is not a supported DataType value.
    """
    numeric_types = {
        DataType.INT64: "BIGINT",
        DataType.FLOAT32: "DOUBLE",
    }
    if dtype in numeric_types:
        return numeric_types[dtype]
    if dtype == DataType.STRING:
        # MySQL requires a sized character type; other engines
        # accept STRING directly.
        return "VARCHAR(255)" if driver == "mysql" else "STRING"
    raise ValueError("unsupported data type {}".format(dtype))
55+
3056

3157
# DataFormat is used in FieldDesc to represent the data format
3258
# of a database field.

python/runtime/local/xgboost/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@
1111
# See the License for the specific language governing permissions and
1212
# limitations under the License.
1313

14+
from runtime.local.xgboost.predict import pred # noqa: F401
1415
from runtime.local.xgboost.train import train # noqa: F401
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
# Copyright 2020 The SQLFlow Authors. All rights reserved.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
import os
15+
import tempfile
16+
17+
import numpy as np
18+
import runtime.xgboost as xgboost_extended
19+
import xgboost as xgb
20+
from runtime import db
21+
from runtime.feature.compile import compile_ir_feature_columns
22+
from runtime.feature.derivation import get_ordered_field_descs
23+
from runtime.feature.field_desc import DataType
24+
from runtime.model.model import Model
25+
from runtime.xgboost.dataset import xgb_dataset
26+
27+
28+
def pred(datasource, select, result_table, pred_label_name, load):
    """
    Run prediction over the input data with a trained XGBoost model and
    write the results into a table.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        pred_label_name (str): the output label name to predict.
        load (str): where the trained model stores.

    Returns:
        None.
    """
    model = Model.load_from_db(datasource, load)
    model_params = model.get_meta("attributes")
    fc_map = model.get_meta("features")
    label_desc = model.get_meta("label").get_field_desc()[0]

    ordered_descs = get_ordered_field_descs(fc_map)
    feature_column_names = [desc.name for desc in ordered_descs]
    feature_metas = {desc.name: desc.to_dict() for desc in ordered_descs}

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    # NOTE(review): assumes Model.load_from_db restores the model file into
    # the current working directory under the fixed name "my_model" — confirm.
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = _create_predict_table(
        conn, select, result_table, label_desc, pred_label_name)

    with tempfile.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=None,
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            raw_data_dir=raw_data_dir)  # NOTE: default to use external memory

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            feature_file_name = os.path.join(
                tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            _predict_and_store_result(bst, pred_dmatrix, model_params,
                                      result_table, result_column_names,
                                      train_label_idx, feature_file_name,
                                      conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
92+
93+
def _predict_and_store_result(bst, dpred, model_params, result_table,
                              result_column_names, train_label_idx,
                              feature_file_name, conn):
    """
    Do prediction and save the prediction result in the table.

    Args:
        bst: the XGBoost booster object.
        dpred: the XGBoost DMatrix input data to predict.
        model_params (dict): the XGBoost model parameters.
        result_table (str): the result table name.
        result_column_names (list[str]): the result column names.
        train_label_idx (int): the index where the trained label is inside
            result_column_names, or -1 if absent.
        feature_file_name (str): the file path where the feature dumps.
        conn: the database connection object.

    Returns:
        None.
    """
    preds = bst.predict(dpred)

    # TODO(yancey1989): should save train_params and model_params
    # not only on PAI submitter
    # TODO(yancey1989): output the original result for various
    # objective function.
    objective = model_params.get("objective", "")
    if objective.startswith("binary:"):
        # Binary objectives output probabilities; threshold at 0.5.
        preds = (preds > 0.5).astype(np.int64)
    elif objective.startswith("multi:") and len(preds.shape) == 2:
        # BUGFIX: was `len(preds) == 2`, which tests the number of
        # prediction rows instead of the array rank. Multi-class
        # objectives that output probabilities (e.g. multi:softprob)
        # return a 2-D (rows, num_class) matrix; pick the most likely
        # class per row. A rank-1 array would make argmax(axis=1) raise.
        preds = np.argmax(np.array(preds), axis=1)

    with db.buffered_db_writer(conn, result_table, result_column_names,
                               100) as w:
        with open(feature_file_name, "r") as feature_file_read:
            for line_no, line in enumerate(feature_file_read):
                if not line:
                    break

                # Drop the trained-label column (if present) from the
                # raw feature dump, then append the prediction.
                row = [
                    item for i, item in enumerate(line.strip().split("/"))
                    if i != train_label_idx
                ]
                row.append(str(preds[line_no]))
                w.write(row)
140+
141+
142+
def _create_predict_table(conn, select, result_table, train_label_desc,
                          pred_label_name):
    """
    Create the result prediction table.

    The result table keeps every selected column except the trained
    label, and appends a column for the predicted label.

    Args:
        conn: the database connection object.
        select (str): the input data to predict.
        result_table (str): the output data table.
        train_label_desc (FieldDesc): the FieldDesc of the trained label.
        pred_label_name (str): the output label name to predict.

    Returns:
        A tuple of (result_column_names, train_label_index).
    """
    name_and_types = db.selected_columns_and_types(conn, select)

    # Locate the trained label among the selected columns; -1 if absent.
    train_label_index = -1
    for idx, (col_name, _) in enumerate(name_and_types):
        if col_name == train_label_desc.name:
            train_label_index = idx
            break
    if train_label_index >= 0:
        del name_and_types[train_label_index]

    column_strs = [
        "%s %s" % (col_name, db.to_db_field_type(conn.driver, col_type))
        for col_name, col_type in name_and_types
    ]
    train_label_field_type = DataType.to_db_field_type(
        conn.driver, train_label_desc.dtype)
    column_strs.append("%s %s" % (pred_label_name, train_label_field_type))

    conn.execute("DROP TABLE IF EXISTS %s;" % result_table)
    conn.execute("CREATE TABLE %s (%s);" % (result_table,
                                            ",".join(column_strs)))

    result_column_names = [pair[0] for pair in name_and_types]
    result_column_names.append(pred_label_name)
    return result_column_names, train_label_index
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# Copyright 2020 The SQLFlow Authors. All rights reserved.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
import os
15+
import tempfile
16+
import unittest
17+
18+
import runtime.db as db
19+
import runtime.testing as testing
20+
from runtime.feature.column import NumericColumn
21+
from runtime.feature.field_desc import FieldDesc
22+
from runtime.local.xgboost import pred, train
23+
24+
25+
class TestXGBoostTrain(unittest.TestCase):
    """Train an XGBoost model locally, run prediction with it, and check
    the prediction result table against the input table."""

    def get_table_row_count(self, conn, table):
        """Return the row count of `table`."""
        ret = list(conn.query("SELECT COUNT(*) FROM %s" % table))
        self.assertEqual(len(ret), 1)
        ret = ret[0]
        self.assertEqual(len(ret), 1)
        return ret[0]

    def get_table_schema(self, conn, table):
        """Return the schema of `table` as a {column_name: type} dict."""
        name_and_types = conn.get_table_schema(table)
        return dict(name_and_types)

    @unittest.skipUnless(testing.get_driver() == "mysql",
                         "skip non mysql tests")
    def test_train_and_predict(self):
        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN xgboost.gbtree
        WITH
            objective="multi:softmax",
            num_boost_round=20,
            num_class=3,
            validation.select="SELECT * FROM iris.test"
        INTO iris.xgboost_train_model_test;
        """

        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {"num_boost_round": 20}
        model_params = {"num_class": 3, "objective": "multi:softmax"}
        save_name = "iris.xgboost_train_model_test"
        class_name = "class"

        old_dir_name = os.getcwd()
        try:
            with tempfile.TemporaryDirectory() as tmp_dir_name:
                # The trained model file is written into the current
                # working directory, so run inside a scratch dir.
                os.chdir(tmp_dir_name)
                eval_result = train(original_sql=original_sql,
                                    model_image="sqlflow:step",
                                    estimator="xgboost.gbtree",
                                    datasource=ds,
                                    select=select,
                                    validation_select=val_select,
                                    model_params=model_params,
                                    train_params=train_params,
                                    feature_column_map=None,
                                    label_column=NumericColumn(
                                        FieldDesc(name=class_name)),
                                    save=save_name)
                self.assertLess(eval_result['train']['merror'][-1], 0.01)
                self.assertLess(eval_result['validate']['merror'][-1], 0.01)

                conn = db.connect_with_data_source(ds)
                try:
                    pred_select = "SELECT * FROM iris.test"
                    pred(ds, pred_select, "iris.predict_result_table",
                         class_name, save_name)

                    self.assertEqual(
                        self.get_table_row_count(conn, "iris.test"),
                        self.get_table_row_count(conn,
                                                 "iris.predict_result_table"))

                    schema1 = self.get_table_schema(conn, "iris.test")
                    schema2 = self.get_table_schema(
                        conn, "iris.predict_result_table")
                    self.assertEqual(len(schema1), len(schema2))
                    for name in schema1:
                        if name == 'class':
                            # The predicted label column is created as
                            # BIGINT regardless of the source column type.
                            self.assertEqual(schema2[name], "BIGINT")
                            continue

                        self.assertTrue(name in schema2)
                        self.assertEqual(schema1[name], schema2[name])

                    diff_schema = schema2.keys() - schema1.keys()
                    self.assertEqual(len(diff_schema), 0)
                finally:
                    # BUGFIX: close the connection even on test failure
                    # (it was never closed in the original).
                    conn.close()
        finally:
            # BUGFIX: restore the working directory even when an assertion
            # fails, so subsequent tests run from the expected cwd.
            os.chdir(old_dir_name)
103+
# Allow running this test module directly with `python <file>`.
if __name__ == '__main__':
    unittest.main()

0 commit comments

Comments
 (0)