Support explaining DNNs using SHAP (#1694)

shendiaomo · web-flow · commit 9132741089d7 · 2020-01-15T00:15:39.000+08:00
* Support explaining DNNs using SHAP

* Reformatting to satisfy isort and yapf

* Fix CI

* Fix CI

* Add unit test

* Remove unnecessary Sprintf
diff --git a/cmd/sqlflowserver/main_test.go b/cmd/sqlflowserver/main_test.go
@@ -293,6 +293,7 @@ func TestEnd2EndMySQL(t *testing.T) {
 	t.Run("CasePredictXGBoostRegression", CasePredictXGBoostRegression)
 	t.Run("CaseTrainDeepWideModel", CaseTrainDeepWideModel)
 	t.Run("CaseTrainDeepWideModelOptimizer", CaseTrainDeepWideModelOptimizer)
+	t.Run("CaseTrainAdaNetAndExplain", CaseTrainAdaNetAndExplain)
 
 	// Cases using feature derivation
 	t.Run("CaseTrainTextClassificationIR", CaseTrainTextClassificationIR)
@@ -378,7 +379,7 @@ func TestEnd2EndHive(t *testing.T) {
 	t.Run("CaseTrainSQLWithMetrics", CaseTrainSQLWithMetrics)
 	t.Run("CaseTrainRegression", CaseTrainRegression)
 	t.Run("CaseTrainCustomModel", CaseTrainCustomModel)
-	t.Run("CaseTrainAdaNet", CaseTrainAdaNet)
+	t.Run("CaseTrainAdaNetAndExplain", CaseTrainAdaNetAndExplain)
 	t.Run("CaseTrainOptimizer", CaseTrainOptimizer)
 	t.Run("CaseTrainDeepWideModel", CaseTrainDeepWideModel)
 	t.Run("CaseTrainDeepWideModelOptimizer", CaseTrainDeepWideModelOptimizer)
@@ -939,16 +940,17 @@ INTO sqlflow_models.my_dnn_linear_model;`
 	}
 }
 
-func CaseTrainAdaNet(t *testing.T) {
+func CaseTrainAdaNetAndExplain(t *testing.T) {
 	a := assert.New(t)
 	trainSQL := `SELECT * FROM iris.train
-TO TRAIN sqlflow_models.AutoClassifier WITH model.n_classes = 3
-LABEL class
-INTO sqlflow_models.my_adanet_model;`
+TO TRAIN sqlflow_models.AutoClassifier WITH model.n_classes = 3 LABEL class INTO sqlflow_models.my_adanet_model;`
 	_, _, err := connectAndRunSQL(trainSQL)
 	if err != nil {
 		a.Fail("run trainSQL error: %v", err)
 	}
+	explainSQL := `SELECT * FROM iris.test LIMIT 10 TO EXPLAIN sqlflow_models.my_adanet_model;`
+	_, _, err = connectAndRunSQL(explainSQL)
+	a.NoError(err)
 }
 
 func CaseTrainDeepWideModelOptimizer(t *testing.T) {
diff --git a/pkg/sql/codegen/tensorflow/codegen.go b/pkg/sql/codegen/tensorflow/codegen.go
@@ -432,10 +432,6 @@ func Pred(predStmt *ir.PredictStmt, session *pb.Session) (string, error) {
 
 // Explain generates a Python program to explain a trained model.
 func Explain(stmt *ir.ExplainStmt, session *pb.Session) (string, error) {
-	if !strings.HasPrefix(stmt.TrainStmt.Estimator, "BoostedTrees") {
-		return "", fmt.Errorf("unsupported model %s", stmt.TrainStmt.Estimator)
-	}
-
 	modelParams, featureColumnsCode, fieldDescs, err := restoreModel(stmt.TrainStmt)
 	if err != nil {
 		return "", err
diff --git a/python/sqlflow_submitter/tensorflow/explain.py b/python/sqlflow_submitter/tensorflow/explain.py
@@ -18,6 +18,7 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
+import shap
 import tensorflow as tf
 from sqlflow_submitter import explainer
 from sqlflow_submitter.db import buffered_db_writer, connect_with_data_source
@@ -83,8 +84,28 @@ def _input_fn():
 
     model_params.update(feature_columns)
     estimator = estimator_cls(**model_params)
-    result = estimator.experimental_predict_with_explanations(
-        lambda: _input_fn())
+    if estimator_cls in (tf.estimator.BoostedTreesClassifier,
+                         tf.estimator.BoostedTreesRegressor):
+        explain_boosted_trees(datasource, estimator, _input_fn, plot_type,
+                              result_table, feature_column_names,
+                              hdfs_namenode_addr, hive_location, hdfs_user,
+                              hdfs_pass)
+    else:
+        shap_dataset = pd.DataFrame(columns=feature_column_names)
+        for i, (features, label) in enumerate(_input_fn()):
+            shap_dataset.loc[i] = [
+                item.numpy()[0][0] for item in features.values()
+            ]
+        explain_dnns(datasource, estimator, shap_dataset, plot_type,
+                     result_table, feature_column_names, hdfs_namenode_addr,
+                     hive_location, hdfs_user, hdfs_pass)
+
+
+def explain_boosted_trees(datasource, estimator, input_fn, plot_type,
+                          result_table, feature_column_names,
+                          hdfs_namenode_addr, hive_location, hdfs_user,
+                          hdfs_pass):
+    result = estimator.experimental_predict_with_explanations(input_fn)
     pred_dicts = list(result)
     df_dfc = pd.DataFrame([pred['dfc'] for pred in pred_dicts])
     dfc_mean = df_dfc.abs().mean()
@@ -98,6 +119,23 @@ def _input_fn():
     explainer.plot_and_save(lambda: eval(plot_type)(df_dfc))
 
 
+def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
+                 feature_column_names, hdfs_namenode_addr, hive_location,
+                 hdfs_user, hdfs_pass):
+    def predict(d):
+        def input_fn():
+            return tf.data.Dataset.from_tensor_slices(
+                dict(pd.DataFrame(d, columns=shap_dataset.columns))).batch(1)
+
+        return np.array(
+            [p['probabilities'][0] for p in estimator.predict(input_fn)])
+
+    shap_values = shap.KernelExplainer(predict,
+                                       shap_dataset).shap_values(shap_dataset)
+    explainer.plot_and_save(lambda: shap.summary_plot(
+        shap_values, shap_dataset, show=False, plot_type=plot_type))
+
+
 def create_explain_result_table(conn, result_table):
     column_clause = ""
     if conn.driver == "mysql":
diff --git a/python/sqlflow_submitter/xgboost/explain.py b/python/sqlflow_submitter/xgboost/explain.py
@@ -11,12 +11,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import json
-import sys
-
-import matplotlib
-import matplotlib.pyplot as plt
-import numpy as np
 import pandas as pd
 import shap
 import xgboost as xgb