Fix xgboost CSV data train bug (#2863)

sneaxiy · web-flow · commit 41a0db025c02 · 2020-08-22T01:18:30.000-05:00
* fix pai xgb bug

* fix shape error

* fix tf train generator

* add e2e ut

* polish

* fix python derivation
diff --git a/go/cmd/sqlflowserver/e2e_common_cases.go b/go/cmd/sqlflowserver/e2e_common_cases.go
@@ -958,3 +958,63 @@ func caseTestOptimizeClauseWithGroupBy(t *testing.T) {
 	a.True(reflect.DeepEqual(decodedRows[2], []interface{}{"plantB", "marketA", int64(30)}))
 	a.True(reflect.DeepEqual(decodedRows[3], []interface{}{"plantB", "marketB", int64(60)}))
 }
+
+// caseEnd2EndXGBoostDenseFeatureColumn runs an XGBoost train / predict /
+// evaluate round trip on the CSV-encoded column c3, declared as DENSE(c3, 4).
+//
+// When isPai is false all tables and the model live in the
+// feature_derivation_case database. When isPai is true the tables are placed
+// under caseDB and the model name is left unqualified. Every statement is sent
+// through connectAndRunSQL; a returned error is reported with a.Fail together
+// with the offending SQL text.
+func caseEnd2EndXGBoostDenseFeatureColumn(t *testing.T, isPai bool) {
+	trainTableName := "feature_derivation_case.train"
+	modelName := "feature_derivation_case.xgb_dense_column_model"
+	predictTableName := "feature_derivation_case.xgb_dense_column_predict_table"
+	evaluateTableName := "feature_derivation_case.xgb_dense_column_evaluate_table"
+
+	// PAI runs use tables under caseDB and an unqualified model name.
+	if isPai {
+		trainTableName = caseDB + ".feature_derivation_train"
+		modelName = "my_xgb_dense_column_model"
+		predictTableName = caseDB + ".xgb_dense_column_predict_table"
+		evaluateTableName = caseDB + ".xgb_dense_column_evaluate_table"
+	}
+
+	// Explicit argument indexes, filled by the fmt.Sprintf call below:
+	// %[1]s = train table, %[2]s = model, %[3]s = predict table,
+	// %[4]s = evaluate table.
+	sqlTemplate := `SELECT c3, class FROM %[1]s
+TO TRAIN xgboost.gbtree
+WITH objective="binary:logistic", 
+    validation.select="SELECT c3, class FROM %[1]s", 
+    train.num_boost_round=100,
+    eta=0.3,
+    max_depth=5
+column DENSE(c3, 4)
+LABEL class
+INTO %[2]s;
+
+SELECT c3 FROM %[1]s TO PREDICT %[3]s.class USING %[2]s;
+
+SELECT * FROM %[3]s;
+
+SELECT c3, class FROM %[1]s
+TO EVALUATE %[2]s
+WITH
+	validation.metrics="accuracy_score,f1_score"
+LABEL class
+INTO %[4]s; 
+
+SELECT * FROM %[4]s;`
+
+	// NOTE(review): despite its name, this statement selects from %[2]s, which
+	// is the model name, not the train table. Presumably it checks that the
+	// trained model was stored as a table; confirm and consider renaming.
+	const selectTrainTableSQL = `SELECT * FROM %[2]s;`
+
+	// The extra SELECT is appended only for non-PAI runs.
+	if !isPai {
+		sqlTemplate += selectTrainTableSQL
+	}
+
+	sqls := fmt.Sprintf(sqlTemplate, trainTableName, modelName, predictTableName, evaluateTableName)
+
+	a := assert.New(t)
+	// Statements are separated by splitting on ";". None of the statements in
+	// the template contains a semicolon inside a quoted value, so the split
+	// yields exactly one entry per statement plus empty remainders.
+	for _, sql := range strings.Split(sqls, ";") {
+		// Shadows the loop variable with its whitespace-trimmed form.
+		sql := strings.TrimSpace(sql)
+		if sql == "" {
+			continue
+		}
+
+		// Restore the terminator removed by strings.Split.
+		sql += ";"
+		_, _, _, err := connectAndRunSQL(sql)
+		if err != nil {
+			// There is no break or return here, so the loop goes on to the
+			// remaining statements after recording the failure.
+			a.Fail(fmt.Sprintf("Run SQL failure:\n%s\n%s", sql, err.Error()))
+		}
+	}
+}
diff --git a/go/cmd/sqlflowserver/e2e_mysql_test.go b/go/cmd/sqlflowserver/e2e_mysql_test.go
@@ -107,6 +107,9 @@ func TestEnd2EndMySQL(t *testing.T) {
 	t.Run("CaseEnd2EndCrossFeatureColumn", caseEnd2EndCrossFeatureColumn)
 
 	t.Run("CaseXGBoostSparseKeyValueColumn", caseXGBoostSparseKeyValueColumn)
+	t.Run("CaseEnd2EndXGBoostDenseFeatureColumn", func(t *testing.T) {
+		caseEnd2EndXGBoostDenseFeatureColumn(t, false)
+	})
 
 	// Cases for optimize
 	t.Run("CaseTestOptimizeClauseWithoutGroupBy", caseTestOptimizeClauseWithoutGroupBy)
diff --git a/go/cmd/sqlflowserver/e2e_pai_maxcompute_test.go b/go/cmd/sqlflowserver/e2e_pai_maxcompute_test.go
@@ -507,6 +507,9 @@ func TestEnd2EndMaxComputePAI(t *testing.T) {
 		// FIXME(typhoonzero): Add this test back when we solve error: model already exist issue on the CI.
 		// t.Run("CaseTrainPAIRandomForests", CaseTrainPAIRandomForests)
 		t.Run("CaseXGBoostSparseKeyValueColumn", caseXGBoostSparseKeyValueColumn)
+		t.Run("CaseEnd2EndXGBoostDenseFeatureColumn", func(t *testing.T) {
+			caseEnd2EndXGBoostDenseFeatureColumn(t, true)
+		})
 	})
 
 }
diff --git a/go/cmd/sqlflowserver/testing.go b/go/cmd/sqlflowserver/testing.go
@@ -237,7 +237,8 @@ func prepareTestData(dbStr string) error {
 		datasets = append(datasets,
 			fmt.Sprintf(testdata.IrisMaxComputeSQL, caseDB),
 			fmt.Sprintf(testdata.ChurnMaxComputeSQL, caseDB),
-			fmt.Sprintf(testdata.XGBoostMaxComputeSparseDataCaseSQL, caseDB))
+			fmt.Sprintf(testdata.XGBoostMaxComputeSparseDataCaseSQL, caseDB),
+			fmt.Sprintf(testdata.FeatureDerivationCaseSQLMaxCompute, caseDB))
 	default:
 		return fmt.Errorf("unrecognized SQLFLOW_TEST_DB %s", db)
 	}
diff --git a/go/ir/derivation.go b/go/ir/derivation.go
@@ -138,7 +138,15 @@ func fillCSVFieldDesc(cellData string, fieldDescMap FieldDescMap, fieldName stri
 		size *= s
 	}
 
-	values := strings.Split(cellData, ",")
+	rawValues := strings.Split(cellData, ",")
+	values := make([]string, 0, len(rawValues))
+	for _, value := range rawValues {
+		trimmedValue := strings.TrimSpace(value)
+		if trimmedValue != "" {
+			values = append(values, trimmedValue)
+		}
+	}
+
 	// set shape only when the column is "DENSE"
 	if fieldDescMap[fieldName].IsSparse == false && fieldDescMap[fieldName].Shape == nil {
 		fieldDescMap[fieldName].Shape = []int{len(values)}
@@ -224,7 +232,7 @@ func inferStringDataFormat(strData string) string {
 	const realNumberRegex = "((\\+|-)?([0-9]+)(\\.[0-9]+)?)|((\\+|-)?\\.?[0-9]+)"
 
 	// string in the form of "3,5,7"
-	csvRegex := regexp.MustCompile(fmt.Sprintf("^((%s)\\,)+(%s)$", realNumberRegex, realNumberRegex))
+	csvRegex := regexp.MustCompile(fmt.Sprintf("^\\s*((%s)\\s*\\,\\s*)+(%s)\\s*(\\,?)\\s*$", realNumberRegex, realNumberRegex))
 	if csvRegex.MatchString(strData) {
 		return csv
 	}
diff --git a/go/ir/derivation_test.go b/go/ir/derivation_test.go
@@ -27,6 +27,11 @@ func TestCSVRegex(t *testing.T) {
 		"1,2,3,4",
 		"1.3,-3.2,132,32",
 		"33,-33",
+		"33,-33,",
+		" 33 , -70 , 80 , ",
+		" 33 , -70 , 80 ,",
+		" 33 , -70 , 80, ",
+		" 33 , -70 , 80,",
 	}
 	for _, s := range csvStings {
 		if inferStringDataFormat(s) != csv {
diff --git a/go/sql/testdata/feature_derivation_case.go b/go/sql/testdata/feature_derivation_case.go
@@ -14,7 +14,7 @@
 package testdata
 
 // FeatureDerivationCaseSQL is .sql format data samples to test feature derivation.
-var FeatureDerivationCaseSQL = `CREATE DATABASE IF NOT EXISTS feature_derivation_case;
+const FeatureDerivationCaseSQL = `CREATE DATABASE IF NOT EXISTS feature_derivation_case;
 DROP TABLE IF EXISTS feature_derivation_case.train;
 CREATE TABLE feature_derivation_case.train (
        c1 float,
@@ -25,14 +25,14 @@ CREATE TABLE feature_derivation_case.train (
        c6 CHAR(255),
        class TINYINT);
 INSERT INTO feature_derivation_case.train VALUES
-(6.4,2.8, '1,4,2,3', '1,3,2,6', '3,140', 'MALE', 0),
-(5.0,2.3, '1,3,8,3', '3,2,5,3', '93,12,1,392,49,13,398', 'FEMALE', 1),
+(6.4,2.8, '1,4,2,3,', '1,3,2,6', '3,140', 'MALE', 0),
+(5.0,2.3, '1,3,8,3,', '3,2,5,3', '93,12,1,392,49,13,398', 'FEMALE', 1),
 (4.9,2.5, '9,2,2,2', '1.2,4.8,3.2,1', '10,11,32,32,1', 'FEMALE', 1),
 (5.1,2.2, '2,1,8,5', '5.0,3,2,1', '23,22,1', 'FEMALE', 1),
 (4.8,3.1, '3,3,2,6', '3,2,3,5', '30,3,1,32', 'NULL', 0);`
 
 // FeatureDerivationCaseSQLHive is .sql format data samples to test feature derivation.
-var FeatureDerivationCaseSQLHive = `CREATE DATABASE IF NOT EXISTS feature_derivation_case;
+const FeatureDerivationCaseSQLHive = `CREATE DATABASE IF NOT EXISTS feature_derivation_case;
 DROP TABLE IF EXISTS feature_derivation_case.train;
 CREATE TABLE feature_derivation_case.train (
        c1 float,
@@ -48,3 +48,22 @@ INSERT INTO TABLE feature_derivation_case.train VALUES
 (4.9,2.5, '9,2,2,2', '1.2,4.8,3.2,1', '10,11,32,32,1', 'FEMALE', 1),
 (5.1,2.2, '2,1,8,5', '5.0,3,2,1', '23,22,1', 'FEMALE', 1),
 (4.8,3.1, '3,3,2,6', '3,2,3,5', '30,3,1,32', 'NULL', 0);`
+
+// FeatureDerivationCaseSQLMaxCompute is .sql format data samples to test feature derivation on MaxCompute.
+const FeatureDerivationCaseSQLMaxCompute = `
+DROP TABLE IF EXISTS %[1]s.feature_derivation_train;
+CREATE TABLE %[1]s.feature_derivation_train (
+       c1 DOUBLE,
+       c2 DOUBLE,
+       c3 STRING,
+       c4 STRING,
+       c5 STRING,
+       c6 STRING,
+       class INT);
+INSERT INTO %[1]s.feature_derivation_train VALUES
+(6.4,2.8, '1,4,2,3,', '1,3,2,6', '3,140', 'MALE', 0),
+(5.0,2.3, '1,3,8,3,', '3,2,5,3', '93,12,1,392,49,13,398', 'FEMALE', 1),
+(4.9,2.5, '9,2,2,2', '1.2,4.8,3.2,1', '10,11,32,32,1', 'FEMALE', 1),
+(5.1,2.2, '2,1,8,5', '5.0,3,2,1', '23,22,1', 'FEMALE', 1),
+(4.8,3.1, '3,3,2,6', '3,2,3,5', '30,3,1,32', 'NULL', 0);
+`
diff --git a/python/runtime/db.py b/python/runtime/db.py
@@ -49,16 +49,19 @@ def read_feature(raw_val, feature_spec, feature_name):
     elif feature_spec["delimiter"] != "":
         # Dense string vector
         if feature_spec["dtype"] == "float32":
-            return np.fromstring(raw_val,
-                                 dtype=np.float32,
-                                 sep=feature_spec["delimiter"])
+            vec = np.fromstring(raw_val,
+                                dtype=np.float32,
+                                sep=feature_spec["delimiter"])
         elif feature_spec["dtype"] == "int64":
-            return np.fromstring(raw_val,
-                                 dtype=np.int64,
-                                 sep=feature_spec["delimiter"])
+            vec = np.fromstring(raw_val,
+                                dtype=np.int64,
+                                sep=feature_spec["delimiter"])
         else:
             raise ValueError('unrecognize dtype {}'.format(
-                feature_spec[feature_name]["dtype"]))
+                feature_spec["dtype"]))
+
+        vec = vec.reshape(list(feature_spec["shape"]))
+        return vec,
     elif feature_spec["dtype"] == "float32":
         return float(raw_val),
     elif feature_spec["dtype"] == "int64":
diff --git a/python/runtime/feature/derivation.py b/python/runtime/feature/derivation.py
@@ -116,7 +116,7 @@ def new_default_field_desc(name):
 
 # A regular expression to match the form of "3,5,7"
 CSV_PATTERN = re.compile(
-    "((%s)\\,)+(%s)" %
+    "\\s*((%s)\\s*\\,\\s*)+(%s)\\s*(\\,?)\\s*" %
     (REAL_NUMBER_PATTERN.pattern, REAL_NUMBER_PATTERN.pattern))
 
 # A regular expression to match the form of "0:3.2 7:-2.3"
@@ -160,7 +160,13 @@ def fill_csv_field_desc(cell, field_desc):
     Returns:
         None.
     """
-    values = cell.split(",")
+    raw_values = cell.split(",")
+    values = []
+    for v in raw_values:
+        v = v.strip()
+        if v:
+            values.append(v)
+
     if field_desc.is_sparse:
         assert field_desc.shape is not None, \
             "the shape of CSV format data must be given"
diff --git a/python/runtime/feature/derivation_test.py b/python/runtime/feature/derivation_test.py
@@ -28,6 +28,11 @@ def test_csv_strings(self):
             "1,2,3,4",
             "1.3,-3.2,132,32",
             "33,-33",
+            "33,-33,",
+            " 33 , -70 , 80 , ",
+            " 33 , -70 , 80 ,",
+            " 33 , -70 , 80, ",
+            " 33 , -70 , 80,",
         ]
 
         for s in csv_strs:
diff --git a/python/runtime/tensorflow/input_fn.py b/python/runtime/tensorflow/input_fn.py
@@ -61,6 +61,12 @@ def reader():
             features = db.read_features_from_row(row, selected_cols,
                                                  feature_column_names,
                                                  feature_metas)
+            features = list(features)
+            for i, f in enumerate(features):
+                if len(f) == 1 and isinstance(f[0], np.ndarray):
+                    features[i] = f[0]
+            features = tuple(features)
+
             if label is None:
                 yield (features, )
             else:
diff --git a/python/runtime/tensorflow/predict.py b/python/runtime/tensorflow/predict.py
@@ -160,12 +160,9 @@ def add_to_example(example, x, i):
         feature_name = feature_column_names[i]
         dtype_str = feature_metas[feature_name]["dtype"]
         if feature_metas[feature_name]["delimiter"] != "":
-            if feature_metas[feature_name]["is_sparse"]:
-                # NOTE(typhoonzero): sparse feature will get
-                # (indices,values,shape) here, use indices only
-                values = x[0][i][0].flatten()
-            else:
-                values = x[0][i].flatten()
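+            # NOTE(typhoonzero): sparse feature will get
+            # (indices,values,shape) here, use indices only.
+            # NOTE(review): dense features are presumably wrapped in a
+            # 1-tuple by db.read_feature, so [0] selects the vector
+            # itself - confirm against the data generator.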
+            values = x[0][i][0].flatten()
             if dtype_str == "float32" or dtype_str == "float64":
                 example.features.feature[feature_name].float_list.value.extend(
                     list(values))
diff --git a/python/runtime/xgboost/evaluate.py b/python/runtime/xgboost/evaluate.py
@@ -16,6 +16,7 @@
 import xgboost as xgb
 from runtime import db
 from runtime.dbapi.paiio import PaiIOConnection
+from runtime.model.metadata import load_metadata
 from runtime.xgboost.dataset import xgb_dataset
 
 SKLEARN_METRICS = [
@@ -76,6 +77,8 @@ def evaluate(datasource,
                         )  # NOTE: default to use external memory
     bst = xgb.Booster({'nthread': 4})  # init model
     bst.load_model("my_model")  # load model
+    if not model_params:
+        model_params = load_metadata("model_meta.json")["attributes"]
     print("Start evaluating XGBoost model...")
     feature_file_id = 0
     for pred_dmatrix in dpred:
diff --git a/python/runtime/xgboost/predict.py b/python/runtime/xgboost/predict.py
@@ -15,6 +15,7 @@
 import xgboost as xgb
 from runtime import db
 from runtime.dbapi.paiio import PaiIOConnection
+from runtime.model.metadata import load_metadata
 from runtime.xgboost.dataset import xgb_dataset
 
 DEFAULT_PREDICT_BATCH_SIZE = 10000
@@ -55,6 +56,8 @@ def pred(datasource,
     bst = xgb.Booster({'nthread': 4})  # init model
     bst.load_model("my_model")  # load data
     print("Start predicting XGBoost model...")
+    if not model_params:
+        model_params = load_metadata("model_meta.json")["attributes"]
 
     selected_cols = db.selected_cols(conn, select)
 
@@ -75,9 +78,6 @@ def predict_and_store_result(bst, dpred, feature_file_id, model_params,
                              feature_column_names, feature_metas, is_pai, conn,
                              result_table):
     preds = bst.predict(dpred)
-
-    # TODO(yancey1989): should save train_params and model_params
-    # not only on PAI submitter
     # TODO(yancey1989): output the original result for various
     # objective function.
     if model_params:

Original file line number	Diff line number	Diff line change
`@@ -507,6 +507,9 @@ func TestEnd2EndMaxComputePAI(t *testing.T) {`
`507`	`507`	`// FIXME(typhoonzero): Add this test back when we solve error: model already exist issue on the CI.`
`508`	`508`	`// t.Run("CaseTrainPAIRandomForests", CaseTrainPAIRandomForests)`
`509`	`509`	`t.Run("CaseXGBoostSparseKeyValueColumn", caseXGBoostSparseKeyValueColumn)`
	`510`	`+ t.Run("CaseEnd2EndXGBoostDenseFeatureColumn", func(t *testing.T) {`
	`511`	`+ caseEnd2EndXGBoostDenseFeatureColumn(t, true)`
	`512`	`+ })`
`510`	`513`	`})`
`511`	`514`
`512`	`515`	`}`