Add feature column compilation (#2772)

sneaxiy · web-flow · commit f0f1d30aeee1 · 2020-07-30T14:02:00.000+08:00
* add feature_column compile

* address review comments

* update
diff --git a/python/runtime/feature/column.py b/python/runtime/feature/column.py
@@ -128,7 +128,7 @@ def num_class(self):
         return self.bucket_size
 
 
-class CategoryHashColumn(CategoryIDColumn):
+class CategoryHashColumn(CategoryColumn):
     """
     CategoryHashColumn represents a categorical hash feature column.
 
@@ -151,7 +151,7 @@ def num_class(self):
         return self.bucket_size
 
 
-class SeqCategoryIDColumn(CategoryIDColumn):
+class SeqCategoryIDColumn(CategoryColumn):
     """
     SeqCategoryIDColumn represents a sequential categorical id feature column.
 
diff --git a/python/runtime/feature/compile.py b/python/runtime/feature/compile.py
@@ -0,0 +1,168 @@
+# Copyright 2020 The SQLFlow Authors. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import six
+from runtime.feature.column import (BucketColumn, CategoryHashColumn,
+                                    CategoryIDColumn, CrossColumn,
+                                    EmbeddingColumn, IndicatorColumn,
+                                    NumericColumn, SeqCategoryIDColumn)
+from runtime.feature.field_desc import DataType
+from runtime.model import EstimatorType
+
+__all__ = [
+    'compile_ir_feature_columns',
+]
+
+
+def to_package_dtype(dtype, package):
+    """
+    Map an IR data type to the matching dtype object of a backend package.
+
+    Args:
+        dtype (DataType): the IR data type; INT, FLOAT and STRING are
+            handled.
+        package (module): the backend package whose ``dtypes`` namespace
+            supplies the result, e.g. the TensorFlow package or the
+            XGBoost feature column package.
+
+    Returns:
+        ``package.dtypes.int64`` for INT, ``package.dtypes.float32`` for
+        FLOAT and ``package.dtypes.string`` for STRING.
+
+    Raises:
+        ValueError: if dtype is none of the handled data types.
+    """
+    if dtype == DataType.INT:
+        attr_name = "int64"
+    elif dtype == DataType.FLOAT:
+        attr_name = "float32"
+    elif dtype == DataType.STRING:
+        attr_name = "string"
+    else:
+        raise ValueError("unsupported data type {}".format(dtype))
+
+    # package.dtypes is only touched once a supported dtype was matched.
+    return getattr(package.dtypes, attr_name)
+
+
+def compile_feature_column(ir_fc, model_type, package):
+    """
+    Compile an IR FeatureColumn object to a runtime feature column object.
+
+    Wrapper columns (BucketColumn, EmbeddingColumn and IndicatorColumn)
+    are compiled recursively: the wrapped column is compiled first and
+    the result is passed to the matching constructor of
+    ``package.feature_column``.
+
+    Args:
+        ir_fc (FeatureColumn): the IR FeatureColumn object.
+        model_type (EstimatorType): one of TENSORFLOW and XGBOOST.
+        package (module): the Python package corresponding to the model_type.
+
+    Returns:
+        A runtime feature column object.
+
+    Raises:
+        AssertionError: if ir_fc is a SeqCategoryIDColumn, CrossColumn or
+            EmbeddingColumn while model_type is XGBOOST, or if a
+            NumericColumn key of a CrossColumn has a shape whose total
+            size is not 1.
+        ValueError: if ir_fc is of an unsupported type, if a CrossColumn
+            key is neither a string nor a NumericColumn, or if the field
+            data type of a CategoryHashColumn cannot be converted by
+            to_package_dtype.
+    """
+    # NOTE: the backend restrictions below are checked with `assert`, so
+    # they raise AssertionError and are skipped when Python runs with -O.
+    fc_package = package.feature_column
+
+    if isinstance(ir_fc, NumericColumn):
+        fd = ir_fc.get_field_desc()[0]
+        return fc_package.numeric_column(fd.name, shape=fd.shape)
+
+    if isinstance(ir_fc, BucketColumn):
+        source_fc = compile_feature_column(ir_fc.source_column, model_type,
+                                           package)
+        return fc_package.bucketized_column(source_fc,
+                                            boundaries=ir_fc.boundaries)
+
+    if isinstance(ir_fc, CategoryIDColumn):
+        fd = ir_fc.get_field_desc()[0]
+        # A non-empty vocabulary takes precedence over the bucket size.
+        if fd.vocabulary:
+            return fc_package.categorical_column_with_vocabulary_list(
+                key=fd.name, vocabulary_list=list(fd.vocabulary))
+        else:
+            return fc_package.categorical_column_with_identity(
+                key=fd.name, num_buckets=ir_fc.bucket_size)
+
+    if isinstance(ir_fc, SeqCategoryIDColumn):
+        assert model_type != EstimatorType.XGBOOST, \
+            "SEQ_CATEGORY_ID is not supported in XGBoost models"
+        fd = ir_fc.get_field_desc()[0]
+        return fc_package.sequence_categorical_column_with_identity(
+            key=fd.name, num_buckets=ir_fc.bucket_size)
+
+    if isinstance(ir_fc, CategoryHashColumn):
+        fd = ir_fc.get_field_desc()[0]
+        dtype = to_package_dtype(fd.dtype, package)
+        return fc_package.categorical_column_with_hash_bucket(
+            key=fd.name, hash_bucket_size=ir_fc.bucket_size, dtype=dtype)
+
+    if isinstance(ir_fc, CrossColumn):
+        assert model_type != EstimatorType.XGBOOST, \
+            "CROSS is not supported in XGBoost models"
+        # Every key is reduced to a plain field name before crossing.
+        key_strs = []
+        for key in ir_fc.keys:
+            if isinstance(key, six.string_types):
+                key_strs.append(key)
+            elif isinstance(key, NumericColumn):
+                fd = key.get_field_desc()[0]
+                # A falsy shape (e.g. None or empty) counts as size 1.
+                size = np.prod(fd.shape) if fd.shape else 1
+                assert size == 1, "CROSS does not support shape not equal to 1"
+                key_strs.append(fd.name)
+            else:
+                # Only NumericColumn objects and strings are accepted here,
+                # so the message names NumericColumn explicitly.
+                raise ValueError(
+                    "field in CROSS must be of NumericColumn or string type")
+
+        return fc_package.crossed_column(
+            key_strs, hash_bucket_size=ir_fc.hash_bucket_size)
+
+    if isinstance(ir_fc, EmbeddingColumn):
+        assert model_type != EstimatorType.XGBOOST, \
+            "EMBEDDING is not supported in XGBoost models"
+        category_column = compile_feature_column(ir_fc.category_column,
+                                                 model_type, package)
+        return fc_package.embedding_column(category_column,
+                                           dimension=ir_fc.dimension,
+                                           combiner=ir_fc.combiner)
+
+    if isinstance(ir_fc, IndicatorColumn):
+        category_column = compile_feature_column(ir_fc.category_column,
+                                                 model_type, package)
+        return fc_package.indicator_column(category_column)
+
+    raise ValueError("unsupported FeatureColumn %s" % type(ir_fc))
+
+
+def compile_ir_feature_columns(ir_features, model_type):
+    """
+    Compile every IR FeatureColumn of a target map for one backend.
+
+    The backend package is imported lazily, so only the package that
+    matches model_type gets loaded.
+
+    Args:
+        ir_features (dict[str -> list[FeatureColumn]]): IR FeatureColumn
+            objects grouped by target name, e.g. "feature_columns".
+        model_type (EstimatorType): one of TENSORFLOW and XGBOOST.
+
+    Returns:
+        dict[str -> list[RuntimeFeatureColumn]]: the same targets, each
+        mapped to the list of compiled runtime feature columns.
+
+    Raises:
+        ValueError: if model_type is neither TENSORFLOW nor XGBOOST.
+        AssertionError: if model_type is XGBOOST and ir_features holds
+            anything other than the single "feature_columns" target.
+    """
+    if model_type == EstimatorType.TENSORFLOW:
+        import tensorflow as package
+    elif model_type == EstimatorType.XGBOOST:
+        import runtime.xgboost as package
+        assert (len(ir_features) == 1
+                and "feature_columns" in ir_features), \
+            "XGBoost only supports 'feature_columns' as the feature target"
+    else:
+        raise ValueError("only support TensorFlow and XGBoost model")
+
+    return {
+        target: [
+            compile_feature_column(ir_fc, model_type, package)
+            for ir_fc in ir_fc_list
+        ]
+        for target, ir_fc_list in ir_features.items()
+    }
diff --git a/python/runtime/feature/compile_test.py b/python/runtime/feature/compile_test.py
@@ -0,0 +1,126 @@
+# Copyright 2020 The SQLFlow Authors. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from runtime.feature.column import (BucketColumn, CategoryHashColumn,
+                                    CategoryIDColumn, CrossColumn,
+                                    EmbeddingColumn, IndicatorColumn,
+                                    NumericColumn, SeqCategoryIDColumn)
+from runtime.feature.compile import compile_ir_feature_columns
+from runtime.feature.field_desc import DataType, FieldDesc
+from runtime.model import EstimatorType
+
+TENSORFLOW = EstimatorType.TENSORFLOW
+XGBOOST = EstimatorType.XGBOOST
+
+
+class TestFeatureColumnCompilation(unittest.TestCase):
+    """Checks the runtime columns compiled from each IR column type."""
+
+    def compile_fc(self, fc, model_type):
+        """Compile one IR column and return its single runtime result."""
+        compiled = compile_ir_feature_columns({"feature_columns": [fc]},
+                                              model_type)
+        self.assertEqual(len(compiled), 1)
+        self.assertIn("feature_columns", compiled)
+        columns = compiled["feature_columns"]
+        self.assertEqual(len(columns), 1)
+        return columns[0]
+
+    def test_numeric_column(self):
+        # Key and shape must survive compilation on both backends.
+        ir_column = NumericColumn(FieldDesc(name='c1', shape=(2, 3)))
+
+        for backend in (TENSORFLOW, XGBOOST):
+            rt_column = self.compile_fc(ir_column, backend)
+            self.assertEqual(rt_column.key, 'c1')
+            self.assertEqual(rt_column.shape, (2, 3))
+
+    def test_bucket_column(self):
+        # The bucketized column wraps the compiled numeric source column.
+        source = NumericColumn(FieldDesc(name='c1', shape=(1, )))
+        ir_column = BucketColumn(source, (-10, -5, 3, 7))
+
+        for backend in (TENSORFLOW, XGBOOST):
+            rt_column = self.compile_fc(ir_column, backend)
+            self.assertEqual(rt_column.source_column.key, 'c1')
+            self.assertEqual(rt_column.boundaries, (-10, -5, 3, 7))
+
+    def test_category_id_column(self):
+        # Without a vocabulary the bucket size is used.
+        ir_column = CategoryIDColumn(FieldDesc(name='c1'), 128)
+
+        for backend in (TENSORFLOW, XGBOOST):
+            rt_column = self.compile_fc(ir_column, backend)
+            self.assertEqual(rt_column.key, 'c1')
+            self.assertEqual(rt_column.num_buckets, 128)
+
+        # With a vocabulary the column is built from the vocabulary list.
+        field = FieldDesc(name='c1', vocabulary={'a', 'b'})
+        ir_column = CategoryIDColumn(field, 128)
+        for backend in (TENSORFLOW, XGBOOST):
+            rt_column = self.compile_fc(ir_column, backend)
+            self.assertEqual(sorted(rt_column.vocabulary_list), ['a', 'b'])
+
+    def test_seq_category_id_column(self):
+        ir_column = SeqCategoryIDColumn(FieldDesc(name='c1'), 64)
+        rt_column = self.compile_fc(ir_column, TENSORFLOW)
+        # NOTE: the TensorFlow sequence categorical column does not have a
+        # key attribute, so only num_buckets is checked here.
+        self.assertEqual(rt_column.num_buckets, 64)
+
+        # Sequence columns are rejected for XGBoost.
+        with self.assertRaises(AssertionError):
+            self.compile_fc(ir_column, XGBOOST)
+
+    def test_category_hash_column(self):
+        field = FieldDesc(name='c1', dtype=DataType.STRING)
+        ir_column = CategoryHashColumn(field, 32)
+
+        for backend in (TENSORFLOW, XGBOOST):
+            rt_column = self.compile_fc(ir_column, backend)
+            self.assertEqual(rt_column.key, 'c1')
+            self.assertEqual(rt_column.hash_bucket_size, 32)
+
+    def test_cross_column(self):
+        # Keys may be plain strings or NumericColumn objects.
+        keys = ['c1', NumericColumn(FieldDesc(name='c2'))]
+        ir_column = CrossColumn(keys, 4096)
+
+        rt_column = self.compile_fc(ir_column, TENSORFLOW)
+        self.assertEqual(list(rt_column.keys), ['c1', 'c2'])
+        self.assertEqual(rt_column.hash_bucket_size, 4096)
+
+        # Cross columns are rejected for XGBoost.
+        with self.assertRaises(AssertionError):
+            self.compile_fc(ir_column, XGBOOST)
+
+    def test_embedding_column(self):
+        field = FieldDesc(name='c1', dtype=DataType.STRING)
+        hash_column = CategoryHashColumn(field, 32)
+        ir_column = EmbeddingColumn(category_column=hash_column,
+                                    combiner='sum',
+                                    dimension=23)
+
+        rt_column = self.compile_fc(ir_column, TENSORFLOW)
+        self.assertEqual(rt_column.combiner, 'sum')
+        self.assertEqual(rt_column.dimension, 23)
+
+        # The wrapped hash column is compiled as well.
+        rt_inner = rt_column.categorical_column
+        self.assertEqual(rt_inner.key, 'c1')
+        self.assertEqual(rt_inner.hash_bucket_size, 32)
+
+        # Embedding columns are rejected for XGBoost.
+        with self.assertRaises(AssertionError):
+            self.compile_fc(ir_column, XGBOOST)
+
+    def test_indicator_column(self):
+        id_column = CategoryIDColumn(FieldDesc(name='c1'), 128)
+        ir_column = IndicatorColumn(category_column=id_column)
+
+        for backend in (TENSORFLOW, XGBOOST):
+            rt_indicator = self.compile_fc(ir_column, backend)
+            rt_inner = rt_indicator.categorical_column
+            self.assertEqual(rt_inner.key, 'c1')
+            self.assertEqual(rt_inner.num_buckets, 128)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/runtime/xgboost/feature_column.py b/python/runtime/xgboost/feature_column.py
@@ -89,8 +89,9 @@ def numeric_column(key, shape=(1, )):
 
 class BucketizedColumnTransformer(CategoricalColumnTransformer):
     def __init__(self, source_column, boundaries):
-        assert boundaries == sorted(
-            boundaries), "Boundaries must be sorted in ascending order"
+        for i in six.moves.range(len(boundaries) - 1):
+            assert boundaries[i] < boundaries[i+1], \
+                "Boundaries must be sorted in ascending order"
         self.source_column = source_column
         self.boundaries = boundaries