Skip to content

Commit 1970b41

Browse files
authored
refine python/runtime/pai by flake8 (#2767)
1 parent 8112a1f commit 1970b41

17 files changed

Lines changed: 147 additions & 181 deletions

python/runtime/pai/__init__.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
# See the License for the specific language governing permissions and
1212
# limitations under the License.
1313

14-
from runtime.pai.submitter import submit_pai_evaluate as evaluate
15-
from runtime.pai.submitter import submit_pai_explain as explain
16-
from runtime.pai.submitter import submit_pai_predict as predict
17-
from runtime.pai.submitter import submit_pai_train as train
14+
from runtime.pai.submitter import submit_pai_evaluate as evaluate # noqa: F401
15+
from runtime.pai.submitter import submit_pai_explain as explain # noqa: F401
16+
from runtime.pai.submitter import submit_pai_predict as predict # noqa: F401
17+
from runtime.pai.submitter import submit_pai_train as train # noqa: F401

python/runtime/pai/cluster_conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,6 @@ def get_cluster_config(attrs):
6464
else:
6565
raise SQLFlowDiagnostic("train.num_evaluator should only be 1 or 0")
6666
conf = {"ps": ps, "worker": worker}
67-
if evaluator != None:
67+
if evaluator is not None:
6868
conf["evaluator"] = evaluator
6969
return conf

python/runtime/pai/entry.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,34 +11,30 @@
1111
# See the License for the specific language governing permissions and
1212
# limitations under the License.
1313

14-
import os
1514
import pickle
16-
import types
1715
from inspect import getargspec
1816

19-
from runtime import oss
2017
from runtime.diagnostics import SQLFlowDiagnostic
2118
from runtime.pai.pai_distributed import define_tf_flags, set_oss_environs
2219
from runtime.pai.tensorflow.evaluate import evaluate as evaluate_tf
2320
from runtime.pai.tensorflow.explain import explain as explain_tf
2421
from runtime.pai.tensorflow.predict import predict as predict_tf
2522
from runtime.pai.tensorflow.train import train as train_tf
26-
from runtime.tensorflow import is_tf_estimator
2723

2824
try:
29-
#(TODO: lhw) split entry.py into multiple files,
25+
# (TODO: lhw) split entry.py into multiple files,
3026
# so, we can only import needed packages
3127
from runtime.pai.xgboost.predict import predict as predict_xgb
3228
from runtime.pai.xgboost.train import train as train_xgb
3329
from runtime.pai.xgboost.explain import explain as explain_xgb
3430
from runtime.pai.xgboost.evaluate import evaluate as evaluate_xgb
35-
except:
31+
except: # noqa: E722
3632
pass
3733

3834

3935
def call_fun(func, params):
4036
"""Call a function with given params, entries in params will be treated
41-
as func' param if the key matches some argument name. Do not support
37+
as func's param if the key matches some argument name. Do not support
4238
var-args in func.
4339
4440
Args:

python/runtime/pai/pai_distributed.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
try:
2222
import tensorflow.compat.v1 as tf
23-
except:
23+
except: # noqa: E722
2424
import tensorflow as tf
2525

2626
# This module contain utilities for PAI distributed training.
@@ -104,7 +104,7 @@ def make_estimator_distributed_runconfig(FLAGS,
104104
is_distributed,
105105
save_checkpoints_steps=100):
106106
if is_distributed:
107-
cluster, task_type, task_index = make_distributed_info_without_evaluator(
107+
cluster, task_type, task_index = make_distributed_info_without_evaluator( # noqa: E501
108108
FLAGS)
109109
dump_into_tf_config(cluster, task_type, task_index)
110110
device_filters = None

python/runtime/pai/random_forest.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,15 +48,17 @@ def get_explain_random_forest_pai_cmd(datasource, model_name, data_table,
4848
datasource: current datasource
4949
model_name: model name on PAI
5050
data_table: input data table name
51-
result_table: name of the result table, PAI will automatically create this table
51+
result_table: name of the result table, PAI will automatically
52+
create this table
5253
label_column: name of the label column
53-
54+
5455
Returns:
5556
A string which is a PAI cmd
5657
"""
57-
# NOTE(typhoonzero): for PAI random forests predicting, we can not load the TrainStmt
58-
# since the model saving is fully done by PAI. We directly use the columns in SELECT
59-
# statement for prediction, error will be reported by PAI job if the columns not match.
58+
# NOTE(typhoonzero): for PAI random forests predicting, we can not load
59+
# the TrainStmt since the model saving is fully done by PAI. We directly
60+
# use the columns in SELECT statement for prediction, error will be
61+
# reported by PAI job if the columns not match.
6062
if not label_column:
6163
return ("must specify WITH label_column when using "
6264
"pai random forest to explain models")

python/runtime/pai/submitter.py

Lines changed: 34 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def drop_tables(tables, datasource):
106106
if table != "":
107107
drop_sql = "DROP TABLE IF EXISTS %s" % table
108108
db.execute(conn, drop_sql)
109-
except:
109+
except: # noqa: E722
110110
# odps will clear table itself, so even fail here, we do
111111
# not need to raise error
112112
print("Encounter error on drop tmp table")
@@ -311,7 +311,8 @@ def get_pai_tf_cmd(cluster_config, tarball, params_file, entry_file,
311311
job_name = "_".join(["sqlflow", model_name]).replace(".", "_")
312312
cf_quote = json.dumps(cluster_config).replace("\"", "\\\"")
313313

314-
# submit table should format as: odps://<project>/tables/<table >,odps://<project>/tables/<table > ...
314+
# submit table should format as: odps://<project>/tables/<table >,
315+
# odps://<project>/tables/<table > ...
315316
submit_tables = max_compute_table_url(train_table)
316317
if train_table != val_table and val_table:
317318
val_table = max_compute_table_url(val_table)
@@ -321,7 +322,8 @@ def get_pai_tf_cmd(cluster_config, tarball, params_file, entry_file,
321322
table = max_compute_table_url(res_table)
322323
output_tables = "-Doutputs=%s" % table
323324

324-
# NOTE(typhoonzero): use - DhyperParameters to define flags passing OSS credentials.
325+
# NOTE(typhoonzero): use - DhyperParameters to define flags passing
326+
# OSS credentials.
325327
# TODO(typhoonzero): need to find a more secure way to pass credentials.
326328
cmd = ("pai -name tensorflow1150 -project algo_public_dev "
327329
"-DmaxHungTimeBeforeGCInSeconds=0 -DjobName=%s -Dtags=dnn "
@@ -333,8 +335,8 @@ def get_pai_tf_cmd(cluster_config, tarball, params_file, entry_file,
333335
oss_checkpoint_configs = os.getenv("SQLFLOW_OSS_CHECKPOINT_CONFIG")
334336
if not oss_checkpoint_configs:
335337
raise SQLFlowDiagnostic(
336-
"need to configure SQLFLOW_OSS_CHECKPOINT_CONFIG when submitting to PAI"
337-
)
338+
"need to configure SQLFLOW_OSS_CHECKPOINT_CONFIG when "
339+
"submitting to PAI")
338340
ckpt_conf = json.loads(oss_checkpoint_configs)
339341
model_url = get_oss_model_url(oss_model_path)
340342
role_name = get_project_role_name(project)
@@ -406,7 +408,8 @@ def submit_pai_train(datasource, estimator_string, select, validation_select,
406408
407409
Args:
408410
datasource: string
409-
Like: odps://access_id:access_key@service.com/api?curr_project=test_ci&scheme=http
411+
Like: odps://access_id:access_key@service.com/api?
412+
curr_project=test_ci&scheme=http
410413
estimator_string: string
411414
Tensorflow estimator name, Keras class name, or XGBoost
412415
select: string
@@ -489,8 +492,9 @@ def get_oss_saved_model_type_and_estimator(model_name, project):
489492
If model is TensorFlow model, return type and estimator name
490493
If model is XGBoost, or other PAI model, just return model type
491494
"""
492-
# FIXME(typhoonzero): if the model not exist on OSS, assume it's a random forest model
493-
# should use a general method to fetch the model and see the model type.
495+
# FIXME(typhoonzero): if the model not exist on OSS, assume it's a random
496+
# forest model should use a general method to fetch the model and see the
497+
# model type.
494498
bucket = oss.get_models_bucket()
495499
tf = bucket.object_exists(model_name + "/tensorflow_model_desc")
496500
if tf:
@@ -529,9 +533,10 @@ def get_pai_predict_cmd(cluster_conf, datasource, project, oss_model_path,
529533
Returns:
530534
The command to submit PAI prediction task
531535
"""
532-
# NOTE(typhoonzero): for PAI machine learning toolkit predicting, we can not load the TrainStmt
533-
# since the model saving is fully done by PAI. We directly use the columns in SELECT
534-
# statement for prediction, error will be reported by PAI job if the columns not match.
536+
# NOTE(typhoonzero): for PAI machine learning toolkit predicting, we can
537+
# not load the TrainStmt since the model saving is fully done by PAI.
538+
# We directly use the columns in SELECT statement for prediction, error
539+
# will be reported by PAI job if the columns not match.
535540
conn = db.connect_with_data_source(datasource)
536541
if model_type == EstimatorType.PAIML:
537542
schema = db.get_table_schema(conn, predict_table)
@@ -621,13 +626,13 @@ def submit_pai_predict(datasource, select, result_table, label_column,
621626
params = dict(locals())
622627

623628
cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")
624-
# TODO(typhoonzero): Do **NOT** create tmp table when the select statement is like:
625-
# "SELECT fields,... FROM table"
629+
# TODO(typhoonzero): Do **NOT** create tmp table when the select statement
630+
# is like: "SELECT fields,... FROM table"
626631
data_table = create_tmp_table_from_select(select, datasource)
627632
params["data_table"] = data_table
628633

629-
# format resultTable name to "db.table" to let the codegen form a submitting
630-
# argument of format "odps://project/tables/table_name"
634+
# format resultTable name to "db.table" to let the codegen form a
635+
# submitting argument of format "odps://project/tables/table_name"
631636
project = get_project(datasource)
632637
if result_table.count(".") == 0:
633638
result_table = "%s.%s" % (project, result_table)
@@ -740,9 +745,10 @@ def get_explain_random_forests_cmd(datasource, model_name, data_table,
740745
Returns:
741746
a PAI cmd to explain the data using given model
742747
"""
743-
# NOTE(typhoonzero): for PAI random forests predicting, we can not load the TrainStmt
744-
# since the model saving is fully done by PAI. We directly use the columns in SELECT
745-
# statement for prediction, error will be reported by PAI job if the columns not match.
748+
# NOTE(typhoonzero): for PAI random forests predicting, we can not load
749+
# the TrainStmt since the model saving is fully done by PAI. We directly
750+
# use the columns in SELECT statement for prediction, error will be
751+
# reported by PAI job if the columns not match.
746752
if not label_column:
747753
raise SQLFlowDiagnostic("must specify WITH label_column when using "
748754
"pai random forest to explain models")
@@ -752,11 +758,12 @@ def get_explain_random_forests_cmd(datasource, model_name, data_table,
752758
db.execute(conn, "DROP TABLE IF EXISTS %s;" % result_table)
753759
schema = db.get_table_schema(conn, data_table)
754760
fields = [f[0] for f in schema if f[0] != label_column]
755-
return (
756-
'''pai -name feature_importance -project algo_public '''
757-
'''-DmodelName="%s" -DinputTableName="%s" '''
758-
'''-DoutputTableName="%s" -DlabelColName="%s" -DfeatureColNames="%s" '''
759-
) % (model_name, data_table, result_table, label_column, ",".join(fields))
761+
return ('''pai -name feature_importance -project algo_public '''
762+
'''-DmodelName="%s" -DinputTableName="%s" '''
763+
'''-DoutputTableName="%s" -DlabelColName="%s" '''
764+
'''-DfeatureColNames="%s" ''') % (model_name, data_table,
765+
result_table, label_column,
766+
",".join(fields))
760767

761768

762769
def submit_pai_explain(datasource, select, result_table, model_name,
@@ -774,13 +781,13 @@ def submit_pai_explain(datasource, select, result_table, model_name,
774781
params = dict(locals())
775782

776783
cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")
777-
# TODO(typhoonzero): Do **NOT** create tmp table when the select statement is like:
778-
# "SELECT fields,... FROM table"
784+
# TODO(typhoonzero): Do **NOT** create tmp table when the select statement
785+
# is like: "SELECT fields,... FROM table"
779786
data_table = create_tmp_table_from_select(select, datasource)
780787
params["data_table"] = data_table
781788

782-
# format resultTable name to "db.table" to let the codegen form a submitting
783-
# argument of format "odps://project/tables/table_name"
789+
# format resultTable name to "db.table" to let the codegen form a
790+
# submitting argument of format "odps://project/tables/table_name"
784791
project = get_project(datasource)
785792
if result_table.count(".") == 0:
786793
result_table = "%s.%s" % (project, result_table)

python/runtime/pai/submitter_test.py

Lines changed: 42 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@
1616
from unittest import TestCase
1717

1818
import runtime.testing as testing
19-
import runtime.xgboost as xgboost_extended
20-
import tensorflow as tf
19+
import runtime.xgboost as xgboost_extended # noqa: F401
2120
from runtime.pai import submitter
2221
from runtime.pai.cluster_conf import get_cluster_config
2322

@@ -28,8 +27,10 @@ def test_get_oss_model_url(self):
2827
self.assertEqual("oss://sqlflow-models/user_a/model", url)
2928

3029
def test_get_datasource_dsn(self):
31-
ds = "odps://access_id:access_key@service.com/api?curr_project=test_ci&scheme=http"
32-
expected_dsn = "access_id:access_key@service.com/api?curr_project=test_ci&scheme=http"
30+
ds = "odps://access_id:access_key@service.com/api?" \
31+
"curr_project=test_ci&scheme=http"
32+
expected_dsn = "access_id:access_key@service.com/api?" \
33+
"curr_project=test_ci&scheme=http"
3334
dsn = submitter.get_datasource_dsn(ds)
3435
self.assertEqual(expected_dsn, dsn)
3536
project = "test_ci"
@@ -38,17 +39,22 @@ def test_get_datasource_dsn(self):
3839
def test_get_pai_tf_cmd(self):
3940
conf = get_cluster_config({})
4041
os.environ[
41-
"SQLFLOW_OSS_CHECKPOINT_CONFIG"] = '''{"arn":"arn", "host":"host"}'''
42+
"SQLFLOW_OSS_CHECKPOINT_CONFIG"] = '{"arn":"arn", "host":"host"}'
4243
cmd = submitter.get_pai_tf_cmd(
4344
conf, "job.tar.gz", "params.txt", "entry.py", "my_dnn_model",
4445
"user1/my_dnn_model", "test_project.input_table",
4546
"test_project.val_table", "test_project.res_table", "test_project")
4647
expected = (
47-
"pai -name tensorflow1150 -project algo_public_dev -DmaxHungTimeBeforeGCInSeconds=0 "
48-
"-DjobName=sqlflow_my_dnn_model -Dtags=dnn -Dscript=job.tar.gz -DentryFile=entry.py "
49-
"-Dtables=odps://test_project/tables/input_table,odps://test_project/tables/val_table "
50-
"-Doutputs=odps://test_project/tables/res_table -DhyperParameters='params.txt' "
51-
"-DcheckpointDir='oss://sqlflow-models/user1/my_dnn_model/?role_arn=arn/pai2osstestproject&host=host' "
48+
"pai -name tensorflow1150 -project algo_public_dev "
49+
"-DmaxHungTimeBeforeGCInSeconds=0 "
50+
"-DjobName=sqlflow_my_dnn_model -Dtags=dnn -Dscript=job.tar.gz "
51+
"-DentryFile=entry.py "
52+
"-Dtables=odps://test_project/tables/input_table,"
53+
"odps://test_project/tables/val_table "
54+
"-Doutputs=odps://test_project/tables/res_table "
55+
"-DhyperParameters='params.txt' "
56+
"-DcheckpointDir='oss://sqlflow-models/user1/my_dnn_model/?"
57+
"role_arn=arn/pai2osstestproject&host=host' "
5258
"-DgpuRequired='0'")
5359
self.assertEqual(expected, cmd)
5460

@@ -58,13 +64,18 @@ def test_get_pai_tf_cmd(self):
5864
"user1/my_dnn_model", "test_project.input_table",
5965
"test_project.val_table", "test_project.res_table", "test_project")
6066
expected = (
61-
"pai -name tensorflow1150 -project algo_public_dev -DmaxHungTimeBeforeGCInSeconds=0 "
62-
"-DjobName=sqlflow_my_dnn_model -Dtags=dnn -Dscript=job.tar.gz -DentryFile=entry.py "
63-
"-Dtables=odps://test_project/tables/input_table,odps://test_project/tables/val_table "
64-
"-Doutputs=odps://test_project/tables/res_table -DhyperParameters='params.txt' "
65-
"-DcheckpointDir='oss://sqlflow-models/user1/my_dnn_model/?role_arn=arn/pai2osstestproject&host=host' "
66-
r'''-Dcluster="{\"ps\": {\"count\": 1, \"cpu\": 200, \"gpu\": 0}, \"worker\": {\"count\": 5, \"cpu\": 400, \"gpu\": 0}}"'''
67-
)
67+
"pai -name tensorflow1150 -project algo_public_dev "
68+
"-DmaxHungTimeBeforeGCInSeconds=0 "
69+
"-DjobName=sqlflow_my_dnn_model -Dtags=dnn -Dscript=job.tar.gz "
70+
"-DentryFile=entry.py "
71+
"-Dtables=odps://test_project/tables/input_table,"
72+
"odps://test_project/tables/val_table "
73+
"-Doutputs=odps://test_project/tables/res_table "
74+
"-DhyperParameters='params.txt' "
75+
"-DcheckpointDir='oss://sqlflow-models/user1/my_dnn_model/?"
76+
"role_arn=arn/pai2osstestproject&host=host' "
77+
r'''-Dcluster="{\"ps\": {\"count\": 1, \"cpu\": 200, \"gpu\": 0}'''
78+
r''', \"worker\": {\"count\": 5, \"cpu\": 400, \"gpu\": 0}}"''')
6879
self.assertEqual(expected, cmd)
6980
del os.environ["SQLFLOW_OSS_CHECKPOINT_CONFIG"]
7081

@@ -136,12 +147,14 @@ def test_submit_pai_train_task(self):
136147
model_params["hidden_units"] = [10, 20]
137148
model_params["n_classes"] = 3
138149

139-
# feature_columns_code will be used to save the training informations together
140-
# with the saved model.
141-
feature_columns_code = """{"feature_columns": [tf.feature_column.numeric_column("sepal_length", shape=[1]),
142-
tf.feature_column.numeric_column("sepal_width", shape=[1]),
143-
tf.feature_column.numeric_column("petal_length", shape=[1]),
144-
tf.feature_column.numeric_column("petal_width", shape=[1])]}"""
150+
# feature_columns_code will be used to save the training information
151+
# together with the saved model.
152+
feature_columns_code = """{"feature_columns": [
153+
tf.feature_column.numeric_column("sepal_length", shape=[1]),
154+
tf.feature_column.numeric_column("sepal_width", shape=[1]),
155+
tf.feature_column.numeric_column("petal_length", shape=[1]),
156+
tf.feature_column.numeric_column("petal_width", shape=[1]),
157+
]}"""
145158
feature_columns = eval(feature_columns_code)
146159

147160
submitter.submit_pai_train(
@@ -172,12 +185,12 @@ def test_submit_pai_train_task(self):
172185
is_pai=True,
173186
feature_columns_code=feature_columns_code,
174187
model_repo_image="",
175-
original_sql=
176-
'''SELECT * FROM alifin_jtest_dev.sqlflow_test_iris_train
177-
TO TRAIN DNNClassifier
178-
WITH model.n_classes = 3, model.hidden_units = [10, 20]
179-
LABEL class
180-
INTO e2etest_pai_dnn;''')
188+
original_sql='''
189+
SELECT * FROM alifin_jtest_dev.sqlflow_test_iris_train
190+
TO TRAIN DNNClassifier
191+
WITH model.n_classes = 3, model.hidden_units = [10, 20]
192+
LABEL class
193+
INTO e2etest_pai_dnn;''')
181194

182195
def test_submit_pai_predict_task(self):
183196
submitter.submit_pai_predict(

0 commit comments

Comments
 (0)