Refine code in python/runtime/xgboost to satisfy flake8 (#2768)

sneaxiy · web-flow · commit 17626562861a · 2020-07-29T19:42:56.000+08:00
diff --git a/python/runtime/xgboost/__init__.py b/python/runtime/xgboost/__init__.py
@@ -11,7 +11,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from runtime.xgboost import feature_column
+from runtime.xgboost import feature_column  # noqa: F401
 
 
 class DataTypeCollection(object):
diff --git a/python/runtime/xgboost/dataset.py b/python/runtime/xgboost/dataset.py
@@ -165,8 +165,8 @@ def dump_dmatrix(filename,
 
             f.write("\t".join(row_data) + "\n")
             row_id += 1
-            # batch_size == None meas use all data in generator
-            if batch_size == None:
+            # A batch_size of None means use all data from the generator
+            if batch_size is None:
                 continue
             if row_id >= batch_size:
                 break
@@ -224,8 +224,9 @@ def get_pai_table_slice_count(table, nworkers, batch_size):
 
     row_cnt = db.get_pai_table_row_num(table)
 
-    assert row_cnt >= nworkers, "Data number {} should not less than worker number {}".format(
-        row_cnt, nworkers)
+    assert row_cnt >= nworkers, "Data number {} should not " \
+                                "less than worker number {}"\
+        .format(row_cnt, nworkers)
 
     slice_num_per_worker = max(int(row_cnt / (nworkers * batch_size)), 1)
     slice_count = slice_num_per_worker * nworkers
@@ -279,7 +280,8 @@ def thread_worker(slice_id):
                 raw_data_dir
             ]))
 
-        assert p.returncode == 0, "The subprocess raises error when reading data"
+        assert p.returncode == 0, \
+            "The subprocess raises error when reading data"
         complete_queue.put(slice_id)
 
     slice_id = rank
diff --git a/python/runtime/xgboost/evaluate.py b/python/runtime/xgboost/evaluate.py
@@ -12,21 +12,33 @@
 # limitations under the License.
 
 import numpy as np
-import sklearn
+import sklearn.metrics
 import xgboost as xgb
 from runtime import db
 from runtime.xgboost.dataset import xgb_dataset
-# yapf: disable
-from sklearn.metrics import (accuracy_score, average_precision_score,
-                             balanced_accuracy_score, brier_score_loss,
-                             cohen_kappa_score, explained_variance_score,
-                             f1_score, fbeta_score, hamming_loss, hinge_loss,
-                             log_loss, mean_absolute_error, mean_squared_error,
-                             mean_squared_log_error, median_absolute_error,
-                             precision_score, r2_score, recall_score,
-                             roc_auc_score, zero_one_loss)
 
-# yapf: enable
+SKLEARN_METRICS = [
+    'accuracy_score',
+    'average_precision_score',
+    'balanced_accuracy_score',
+    'brier_score_loss',
+    'cohen_kappa_score',
+    'explained_variance_score',
+    'f1_score',
+    'fbeta_score',
+    'hamming_loss',
+    'hinge_loss',
+    'log_loss',
+    'mean_absolute_error',
+    'mean_squared_error',
+    'mean_squared_log_error',
+    'median_absolute_error',
+    'precision_score',
+    'r2_score',
+    'recall_score',
+    'roc_auc_score',
+    'zero_one_loss',
+]
 
 DEFAULT_PREDICT_BATCH_SIZE = 10000
 
@@ -95,8 +107,9 @@ def evaluate_and_store_result(bst, dpred, feature_file_id, validation_metrics,
             # using the original prediction result of predict API by default
             pass
     else:
-        # prediction output with multi-class job has two dimensions, this is a temporary
-        # way, can remove this else branch when we can load the model meta not only on PAI submitter.
+        # The prediction output of a multi-class job has two dimensions.
+        # This is a temporary workaround; remove this else branch once the
+        # model meta can be loaded outside the PAI submitter as well.
         if len(preds.shape) == 2:
             preds = np.argmax(np.array(preds), axis=1)
 
@@ -121,7 +134,9 @@ def evaluate_and_store_result(bst, dpred, feature_file_id, validation_metrics,
 
     evaluate_results = dict()
     for metric_name in validation_metrics:
-        metric_func = eval(metric_name)
+        if metric_name not in SKLEARN_METRICS:
+            raise ValueError("unsupported metric: %s" % metric_name)
+        metric_func = getattr(sklearn.metrics, metric_name)
         metric_value = metric_func(y_test, preds)
         evaluate_results[metric_name] = metric_value
 
diff --git a/python/runtime/xgboost/explain.py b/python/runtime/xgboost/explain.py
@@ -184,8 +184,10 @@ def explain(datasource,
 
     if result_table != "":
         if is_pai:
-            # TODO(typhoonzero): the shape of shap_values is (3, num_samples, num_features)
-            # use the first dimension here, should find out how to use the other two.
+            # TODO(typhoonzero): the shape of shap_values is
+            # (3, num_samples, num_features); only the first
+            # slice along axis 0 is used here. Find out how
+            # to use the other two.
             write_shap_values(shap_values[0], "pai_maxcompute", None,
                               result_table, feature_column_names,
                               hdfs_namenode_addr, hive_location, hdfs_user,
diff --git a/python/runtime/xgboost/feature_column.py b/python/runtime/xgboost/feature_column.py
@@ -32,7 +32,7 @@
 if six.PY2:
 
     def hashing(x):
-        return long(hashlib.sha1(x).hexdigest(), 16)
+        return long(hashlib.sha1(x).hexdigest(), 16)  # noqa: F821
 else:
 
     def hashing(x):
@@ -139,9 +139,9 @@ def elementwise_transform_fn(x):
                 if self.default_value is not None:
                     return self.default_value
                 else:
-                    raise ValueError(
-                        'The categorical value of column {} out of range [0, {})'
-                        .format(self.key, self.num_buckets))
+                    raise ValueError('The categorical value of column {} '
+                                     'out of range [0, {})'.format(
+                                         self.key, self.num_buckets))
 
             if isinstance(slot_value, np.ndarray):
                 output = elementwise_transform(
@@ -174,7 +174,7 @@ def num_classes(self):
         return len(self.vocabulary_list)
 
     def __call__(self, inputs):
-        fn = lambda x: self.vocabulary_list.index(x)
+        fn = lambda x: self.vocabulary_list.index(x)  # noqa: E731
 
         def transform_fn(slot_value):
             if isinstance(slot_value, np.ndarray):
@@ -208,7 +208,7 @@ def num_classes(self):
         return self.hash_bucket_size
 
     def __call__(self, inputs):
-        fn = lambda x: hashing(x) % self.hash_bucket_size
+        fn = lambda x: hashing(x) % self.hash_bucket_size  # noqa: E731
 
         def transform_fn(slot_value):
             if isinstance(slot_value, np.ndarray):
@@ -230,7 +230,9 @@ def categorical_column_with_hash_bucket(key, hash_bucket_size, dtype='string'):
 class IndicatorColumnTransformer(BaseColumnTransformer):
     def __init__(self, categorical_column):
         assert isinstance(categorical_column, CategoricalColumnTransformer), \
-            "categorical_column must be type of CategoricalColumnTransformer but got {}".format(type(categorical_column))
+            "categorical_column must be type of " \
+            "CategoricalColumnTransformer but got {}".format(
+                type(categorical_column))
         self.categorical_column = categorical_column
 
     def _set_feature_column_names(self, names):
diff --git a/python/runtime/xgboost/predict.py b/python/runtime/xgboost/predict.py
@@ -86,8 +86,10 @@ def predict_and_store_result(bst, dpred, feature_file_id, model_params,
                              hdfs_user, hdfs_pass):
     preds = bst.predict(dpred)
 
-    #TODO(yancey1989): should save train_params and model_params not only on PAI submitter
-    #TODO(yancey1989): output the original result for various objective function.
+    # TODO(yancey1989): save train_params and model_params somewhere
+    # other than only on the PAI submitter.
+    # TODO(yancey1989): output the original result for various
+    # objective functions.
     if model_params:
         obj = model_params["objective"]
         if obj.startswith("binary:"):
@@ -98,8 +100,9 @@ def predict_and_store_result(bst, dpred, feature_file_id, model_params,
             # using the original prediction result of predict API by default
             pass
     else:
-        # prediction output with multi-class job has two dimensions, this is a temporary
-        # way, can remove this else branch when we can load the model meta not only on PAI submitter.
+        # The prediction output of a multi-class job has two dimensions.
+        # This is a temporary workaround; remove this else branch once the
+        # model meta can be loaded outside the PAI submitter as well.
         if len(preds.shape) == 2:
             preds = np.argmax(np.array(preds), axis=1)
 
@@ -134,12 +137,12 @@ def predict_and_store_result(bst, dpred, feature_file_id, model_params,
                                hive_location=hive_location,
                                hdfs_user=hdfs_user,
                                hdfs_pass=hdfs_pass) as w:
-        import sys
         while True:
             line = feature_file_read.readline()
             if not line:
                 break
-            # FIXME(typhoonzero): how to output columns that are not used as features, like ids?
+            # FIXME(typhoonzero): how to output columns that are not used
+            # as features, like ids?
             row = [
                 item for i, item in enumerate(line.strip().split("/"))
                 if i != train_label_index
diff --git a/python/runtime/xgboost/tracker.py b/python/runtime/xgboost/tracker.py
@@ -24,7 +24,8 @@
  - help nodes to establish links with each other
 Tianqi Chen
 """
-# pylint: disable=invalid-name, missing-docstring, too-many-arguments, too-many-locals
+# pylint: disable=invalid-name, missing-docstring
+# pylint: disable=too-many-arguments, too-many-locals
 # pylint: disable=too-many-branches, too-many-statements
 from __future__ import absolute_import
 
@@ -436,9 +437,8 @@ def get_host_ip(hostIP=None):
         try:
             hostIP = socket.gethostbyname(socket.getfqdn())
         except gaierror:
-            logging.warn(
-                'gethostbyname(socket.getfqdn()) failed... trying on hostname()'
-            )
+            logging.warn('gethostbyname(socket.getfqdn()) failed... '
+                         'trying on hostname()')
             hostIP = socket.gethostbyname(socket.gethostname())
         if hostIP.startswith("127."):
             s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
diff --git a/python/runtime/xgboost/train.py b/python/runtime/xgboost/train.py
@@ -12,7 +12,6 @@
 # limitations under the License.
 
 import json
-import os
 import sys
 
 import runtime.pai.pai_distributed as pai_dist
@@ -200,8 +199,9 @@ def save_model_to_local_file(booster, model_params, meta, filename):
     from sklearn2pmml import PMMLPipeline, sklearn2pmml
     try:
         from xgboost.compat import XGBoostLabelEncoder
-    except:
-        # xgboost==0.82.0 does not have XGBoostLabelEncoder in xgboost.compat.py
+    except:  # noqa: E722
+        # xgboost==0.82.0 does not have XGBoostLabelEncoder
+        # in xgboost.compat.py
         from xgboost.sklearn import XGBLabelEncoder as XGBoostLabelEncoder
 
     objective = model_params.get("objective")
@@ -212,10 +212,11 @@ def save_model_to_local_file(booster, model_params, meta, filename):
             num_class = 2
         else:
             num_class = model_params.get("num_class")
-            assert num_class is not None and num_class > 0, "num_class should not be None"
+            assert num_class is not None and num_class > 0, \
+                "num_class should not be None"
 
-        # To fake a trained XGBClassifier, there must be "_le", "classes_", inside
-        # XGBClassifier. See here:
+        # To fake a trained XGBClassifier, there must be "_le", "classes_",
+        # inside XGBClassifier. See here:
         # https://github.com/dmlc/xgboost/blob/d19cec70f1b40ea1e1a35101ca22e46dd4e4eecd/python-package/xgboost/sklearn.py#L356
         model = xgb.XGBClassifier()
         label_encoder = XGBoostLabelEncoder()