Merge branch 'master' of github.com:medoidai/skrobot

medoidai · Jan 10, 2021 · c935c8f
2 parents b93e314 + 15e7997
commit c935c8f
Show file tree

Hide file tree

Showing 56 changed files with 60 additions and 20 deletions.
diff --git a/docs/source/how_do_i_use_it.rst b/docs/source/how_do_i_use_it.rst
@@ -153,7 +153,7 @@ The following example has generated the following `results <https://github.com/m
                                             random_seed=random_seed))
 
     # Run Prediction Task
-    new_data_set = ft.calculate_feature_matrix(feature_defs, entities={ "passengers" : (new_raw_data_set, id_column, None, variable_types) }, relationships=())
+    new_data_set = ft.calculate_feature_matrix(feature_defs, entities={ "passengers" : (new_raw_data_set, id_column, None, variable_types) })
 
     new_data_set.reset_index(inplace=True)
 
@@ -290,4 +290,4 @@ The following example has generated the following `results <https://github.com/m
 
    print(train_results['estimator'])
 
-   print(predictions)
+   print(predictions)
diff --git a/examples/example_titanic_pipeline_with_model_based_feature_selection.py b/examples/example_titanic_pipeline_with_model_based_feature_selection.py
@@ -141,7 +141,7 @@
                                          random_seed=random_seed))
 
 # Run Prediction Task
-new_data_set = ft.calculate_feature_matrix(feature_defs, entities={ "passengers" : (new_raw_data_set, id_column, None, variable_types) }, relationships=())
+new_data_set = ft.calculate_feature_matrix(feature_defs, entities={ "passengers" : (new_raw_data_set, id_column, None, variable_types) })
 
 new_data_set.reset_index(inplace=True)
 
@@ -171,4 +171,4 @@
 
 print(train_results['estimator'])
 
-print(predictions)
+print(predictions)
diff --git a/...29T00-53-45-example-customer-transactional-deep-feature-synthesis/feature_definitions.txt b/...29T00-53-45-example-customer-transactional-deep-feature-synthesis/feature_definitions.txt
diff --git a/...e-synthesis/feature_graphs/session_id.png → ...ep-feature-synthesis/feature_graphs/0.png b/...e-synthesis/feature_graphs/session_id.png → ...ep-feature-synthesis/feature_graphs/0.png
diff --git a/...e-synthesis/feature_graphs/product_id.png → ...ep-feature-synthesis/feature_graphs/1.png b/...e-synthesis/feature_graphs/product_id.png → ...ep-feature-synthesis/feature_graphs/1.png
diff --git a/...essions.MODE(transactions.product_id).png → ...p-feature-synthesis/feature_graphs/10.png b/...essions.MODE(transactions.product_id).png → ...p-feature-synthesis/feature_graphs/10.png
diff --git a/...s.NUM_UNIQUE(transactions.product_id).png → ...p-feature-synthesis/feature_graphs/11.png b/...s.NUM_UNIQUE(transactions.product_id).png → ...p-feature-synthesis/feature_graphs/11.png
diff --git a/...re_graphs/sessions.DAY(session_start).png → ...p-feature-synthesis/feature_graphs/12.png b/...re_graphs/sessions.DAY(session_start).png → ...p-feature-synthesis/feature_graphs/12.png
diff --git a/..._graphs/sessions.MONTH(session_start).png → ...p-feature-synthesis/feature_graphs/13.png b/..._graphs/sessions.MONTH(session_start).png → ...p-feature-synthesis/feature_graphs/13.png
diff --git a/...raphs/sessions.WEEKDAY(session_start).png → ...p-feature-synthesis/feature_graphs/14.png b/...raphs/sessions.WEEKDAY(session_start).png → ...p-feature-synthesis/feature_graphs/14.png
diff --git a/...e_graphs/sessions.YEAR(session_start).png → ...p-feature-synthesis/feature_graphs/15.png b/...e_graphs/sessions.YEAR(session_start).png → ...p-feature-synthesis/feature_graphs/15.png
diff --git a/...re_graphs/sessions.customers.zip_code.png → ...p-feature-synthesis/feature_graphs/16.png b/...re_graphs/sessions.customers.zip_code.png → ...p-feature-synthesis/feature_graphs/16.png
diff --git a/...e_graphs/products.COUNT(transactions).png → ...p-feature-synthesis/feature_graphs/17.png b/...e_graphs/products.COUNT(transactions).png → ...p-feature-synthesis/feature_graphs/17.png
diff --git a/...roducts.MODE(transactions.session_id).png → ...p-feature-synthesis/feature_graphs/18.png b/...roducts.MODE(transactions.session_id).png → ...p-feature-synthesis/feature_graphs/18.png
diff --git a/...s.NUM_UNIQUE(transactions.session_id).png → ...p-feature-synthesis/feature_graphs/19.png b/...s.NUM_UNIQUE(transactions.session_id).png → ...p-feature-synthesis/feature_graphs/19.png
diff --git a/.../feature_graphs/DAY(transaction_time).png → ...ep-feature-synthesis/feature_graphs/2.png b/.../feature_graphs/DAY(transaction_time).png → ...ep-feature-synthesis/feature_graphs/2.png
diff --git a/...eature_graphs/MONTH(transaction_time).png → ...ep-feature-synthesis/feature_graphs/3.png b/...eature_graphs/MONTH(transaction_time).png → ...ep-feature-synthesis/feature_graphs/3.png
diff --git a/...ture_graphs/WEEKDAY(transaction_time).png → ...ep-feature-synthesis/feature_graphs/4.png b/...ture_graphs/WEEKDAY(transaction_time).png → ...ep-feature-synthesis/feature_graphs/4.png
diff --git a/...feature_graphs/YEAR(transaction_time).png → ...ep-feature-synthesis/feature_graphs/5.png b/...feature_graphs/YEAR(transaction_time).png → ...ep-feature-synthesis/feature_graphs/5.png
diff --git a/...s/feature_graphs/sessions.customer_id.png → ...ep-feature-synthesis/feature_graphs/6.png b/...s/feature_graphs/sessions.customer_id.png → ...ep-feature-synthesis/feature_graphs/6.png
diff --git a/...thesis/feature_graphs/sessions.device.png → ...ep-feature-synthesis/feature_graphs/7.png b/...thesis/feature_graphs/sessions.device.png → ...ep-feature-synthesis/feature_graphs/7.png
diff --git a/...nthesis/feature_graphs/products.brand.png → ...ep-feature-synthesis/feature_graphs/8.png b/...nthesis/feature_graphs/products.brand.png → ...ep-feature-synthesis/feature_graphs/8.png
diff --git a/...e_graphs/sessions.COUNT(transactions).png → ...ep-feature-synthesis/feature_graphs/9.png b/...e_graphs/sessions.COUNT(transactions).png → ...ep-feature-synthesis/feature_graphs/9.png
diff --git a/...9T00-53-45-example-customer-transactional-deep-feature-synthesis/feature_information.html b/...9T00-53-45-example-customer-transactional-deep-feature-synthesis/feature_information.html
@@ -1,108 +1,129 @@
 <table border="1" class="dataframe">
   <thead>
     <tr style="text-align: right;">
+      <th>feature_id</th>
       <th>feature_name</th>
       <th>feature_type</th>
       <th>feature_description</th>
     </tr>
   </thead>
   <tbody>
     <tr>
+      <td>0</td>
       <td>session_id</td>
       <td>int64</td>
       <td>The "session_id".</td>
     </tr>
     <tr>
+      <td>1</td>
       <td>product_id</td>
       <td>category</td>
       <td>The "product_id".</td>
     </tr>
     <tr>
+      <td>2</td>
       <td>DAY(transaction_time)</td>
       <td>int64</td>
       <td>The day of the month of the "transaction_time".</td>
     </tr>
     <tr>
+      <td>3</td>
       <td>MONTH(transaction_time)</td>
       <td>int64</td>
       <td>The month of the "transaction_time".</td>
     </tr>
     <tr>
+      <td>4</td>
       <td>WEEKDAY(transaction_time)</td>
       <td>int64</td>
       <td>The day of the week of the "transaction_time".</td>
     </tr>
     <tr>
+      <td>5</td>
       <td>YEAR(transaction_time)</td>
       <td>int64</td>
       <td>The year of the "transaction_time".</td>
     </tr>
     <tr>
+      <td>6</td>
       <td>sessions.customer_id</td>
       <td>int64</td>
       <td>The "customer_id" for the instance of "sessions" associated with this instance of "transactions".</td>
     </tr>
     <tr>
+      <td>7</td>
       <td>sessions.device</td>
       <td>object</td>
       <td>The "device" for the instance of "sessions" associated with this instance of "transactions".</td>
     </tr>
     <tr>
+      <td>8</td>
       <td>products.brand</td>
       <td>object</td>
       <td>The "brand" for the instance of "products" associated with this instance of "transactions".</td>
     </tr>
     <tr>
+      <td>9</td>
       <td>sessions.COUNT(transactions)</td>
       <td>int64</td>
       <td>The number of all instances of "transactions" for each "session_id" in "sessions" for the instance of "sessions" associated with this instance of "transactions".</td>
     </tr>
     <tr>
+      <td>10</td>
       <td>sessions.MODE(transactions.product_id)</td>
       <td>int64</td>
       <td>The most frequently occurring value of the "product_id" of all instances of "transactions" for each "session_id" in "sessions" for the instance of "sessions" associated with this instance of "transactions".</td>
     </tr>
     <tr>
+      <td>11</td>
       <td>sessions.NUM_UNIQUE(transactions.product_id)</td>
       <td>int64</td>
       <td>The number of unique elements in the "product_id" of all instances of "transactions" for each "session_id" in "sessions" for the instance of "sessions" associated with this instance of "transactions".</td>
     </tr>
     <tr>
+      <td>12</td>
       <td>sessions.DAY(session_start)</td>
       <td>int64</td>
       <td>The day of the month of the "session_start" for the instance of "sessions" associated with this instance of "transactions".</td>
     </tr>
     <tr>
+      <td>13</td>
       <td>sessions.MONTH(session_start)</td>
       <td>int64</td>
       <td>The month of the "session_start" for the instance of "sessions" associated with this instance of "transactions".</td>
     </tr>
     <tr>
+      <td>14</td>
       <td>sessions.WEEKDAY(session_start)</td>
       <td>int64</td>
       <td>The day of the week of the "session_start" for the instance of "sessions" associated with this instance of "transactions".</td>
     </tr>
     <tr>
+      <td>15</td>
       <td>sessions.YEAR(session_start)</td>
       <td>int64</td>
       <td>The year of the "session_start" for the instance of "sessions" associated with this instance of "transactions".</td>
     </tr>
     <tr>
+      <td>16</td>
       <td>sessions.customers.zip_code</td>
       <td>object</td>
       <td>The "zip_code" for the instance of "customers" associated with the instance of "sessions" associated with this instance of "transactions".</td>
     </tr>
     <tr>
+      <td>17</td>
       <td>products.COUNT(transactions)</td>
       <td>int64</td>
       <td>The number of all instances of "transactions" for each "product_id" in "products" for the instance of "products" associated with this instance of "transactions".</td>
     </tr>
     <tr>
+      <td>18</td>
       <td>products.MODE(transactions.session_id)</td>
       <td>int64</td>
       <td>The most frequently occurring value of the "session_id" of all instances of "transactions" for each "product_id" in "products" for the instance of "products" associated with this instance of "transactions".</td>
     </tr>
     <tr>
+      <td>19</td>
       <td>products.NUM_UNIQUE(transactions.session_id)</td>
       <td>int64</td>
       <td>The number of unique elements in the "session_id" of all instances of "transactions" for each "product_id" in "products" for the instance of "products" associated with this instance of "transactions".</td>

diff --git a/...el-based-feature-selection/example_titanic_pipeline_with_model_based_feature_selection.py b/...el-based-feature-selection/example_titanic_pipeline_with_model_based_feature_selection.py
@@ -141,7 +141,7 @@
                                          random_seed=random_seed))
 
 # Run Prediction Task
-new_data_set = ft.calculate_feature_matrix(feature_defs, entities={ "passengers" : (new_raw_data_set, id_column, None, variable_types) }, relationships=())
+new_data_set = ft.calculate_feature_matrix(feature_defs, entities={ "passengers" : (new_raw_data_set, id_column, None, variable_types) })
 
 new_data_set.reset_index(inplace=True)
 
@@ -171,4 +171,4 @@
 
 print(train_results['estimator'])
 
-print(predictions)
+print(predictions)
diff --git a/...16-26-example-titanic-pipeline-with-model-based-feature-selection/feature_definitions.txt b/...16-26-example-titanic-pipeline-with-model-based-feature-selection/feature_definitions.txt
diff --git a/...-feature-selection/feature_graphs/Age.png → ...ed-feature-selection/feature_graphs/0.png b/...-feature-selection/feature_graphs/Age.png → ...ed-feature-selection/feature_graphs/0.png
diff --git a/...feature-selection/feature_graphs/Fare.png → ...ed-feature-selection/feature_graphs/1.png b/...feature-selection/feature_graphs/Fare.png → ...ed-feature-selection/feature_graphs/1.png
diff --git a/...selection/feature_graphs/Fare + Parch.png → ...d-feature-selection/feature_graphs/10.png b/...selection/feature_graphs/Fare + Parch.png → ...d-feature-selection/feature_graphs/10.png
diff --git a/...selection/feature_graphs/Fare + SibSp.png → ...d-feature-selection/feature_graphs/11.png b/...selection/feature_graphs/Fare + SibSp.png → ...d-feature-selection/feature_graphs/11.png
diff --git a/...ample-titanic-pipeline-with-model-based-feature-selection/feature_graphs/12.png b/...ample-titanic-pipeline-with-model-based-feature-selection/feature_graphs/12.png
diff --git a/...e-selection/feature_graphs/Age * Fare.png → ...d-feature-selection/feature_graphs/13.png b/...e-selection/feature_graphs/Age * Fare.png → ...d-feature-selection/feature_graphs/13.png
diff --git a/...ample-titanic-pipeline-with-model-based-feature-selection/feature_graphs/14.png b/...ample-titanic-pipeline-with-model-based-feature-selection/feature_graphs/14.png
diff --git a/...ample-titanic-pipeline-with-model-based-feature-selection/feature_graphs/15.png b/...ample-titanic-pipeline-with-model-based-feature-selection/feature_graphs/15.png
diff --git a/...selection/feature_graphs/Fare * Parch.png → ...d-feature-selection/feature_graphs/16.png b/...selection/feature_graphs/Fare * Parch.png → ...d-feature-selection/feature_graphs/16.png
diff --git a/...selection/feature_graphs/Fare * SibSp.png → ...d-feature-selection/feature_graphs/17.png b/...selection/feature_graphs/Fare * SibSp.png → ...d-feature-selection/feature_graphs/17.png
diff --git a/...ample-titanic-pipeline-with-model-based-feature-selection/feature_graphs/18.png b/...ample-titanic-pipeline-with-model-based-feature-selection/feature_graphs/18.png
diff --git a/...eature-selection/feature_graphs/SibSp.png → ...ed-feature-selection/feature_graphs/2.png b/...eature-selection/feature_graphs/SibSp.png → ...ed-feature-selection/feature_graphs/2.png
diff --git a/...eature-selection/feature_graphs/Parch.png → ...ed-feature-selection/feature_graphs/3.png b/...eature-selection/feature_graphs/Parch.png → ...ed-feature-selection/feature_graphs/3.png
diff --git a/...ure-selection/feature_graphs/Embarked.png → ...ed-feature-selection/feature_graphs/4.png b/...ure-selection/feature_graphs/Embarked.png → ...ed-feature-selection/feature_graphs/4.png
diff --git a/...-feature-selection/feature_graphs/Sex.png → ...ed-feature-selection/feature_graphs/5.png b/...-feature-selection/feature_graphs/Sex.png → ...ed-feature-selection/feature_graphs/5.png
diff --git a/...ature-selection/feature_graphs/Pclass.png → ...ed-feature-selection/feature_graphs/6.png b/...ature-selection/feature_graphs/Pclass.png → ...ed-feature-selection/feature_graphs/6.png
diff --git a/...e-selection/feature_graphs/Age + Fare.png → ...ed-feature-selection/feature_graphs/7.png b/...e-selection/feature_graphs/Age + Fare.png → ...ed-feature-selection/feature_graphs/7.png
diff --git a/...xample-titanic-pipeline-with-model-based-feature-selection/feature_graphs/8.png b/...xample-titanic-pipeline-with-model-based-feature-selection/feature_graphs/8.png
diff --git a/...xample-titanic-pipeline-with-model-based-feature-selection/feature_graphs/9.png b/...xample-titanic-pipeline-with-model-based-feature-selection/feature_graphs/9.png
diff --git a/...anic-pipeline-with-model-based-feature-selection/feature_graphs/Age * Parch.png b/...anic-pipeline-with-model-based-feature-selection/feature_graphs/Age * Parch.png
diff --git a/...anic-pipeline-with-model-based-feature-selection/feature_graphs/Age * SibSp.png b/...anic-pipeline-with-model-based-feature-selection/feature_graphs/Age * SibSp.png
diff --git a/...anic-pipeline-with-model-based-feature-selection/feature_graphs/Age + Parch.png b/...anic-pipeline-with-model-based-feature-selection/feature_graphs/Age + Parch.png
diff --git a/...anic-pipeline-with-model-based-feature-selection/feature_graphs/Age + SibSp.png b/...anic-pipeline-with-model-based-feature-selection/feature_graphs/Age + SibSp.png
diff --git a/...ic-pipeline-with-model-based-feature-selection/feature_graphs/Parch * SibSp.png b/...ic-pipeline-with-model-based-feature-selection/feature_graphs/Parch * SibSp.png
diff --git a/...ic-pipeline-with-model-based-feature-selection/feature_graphs/Parch + SibSp.png b/...ic-pipeline-with-model-based-feature-selection/feature_graphs/Parch + SibSp.png
diff --git a/...6-26-example-titanic-pipeline-with-model-based-feature-selection/feature_information.html b/...6-26-example-titanic-pipeline-with-model-based-feature-selection/feature_information.html
@@ -1,103 +1,123 @@
 <table border="1" class="dataframe">
   <thead>
     <tr style="text-align: right;">
+      <th>feature_id</th>
       <th>feature_name</th>
       <th>feature_type</th>
       <th>feature_description</th>
     </tr>
   </thead>
   <tbody>
     <tr>
+      <td>0</td>
       <td>Age</td>
       <td>float64</td>
       <td>The "Age".</td>
     </tr>
     <tr>
+      <td>1</td>
       <td>Fare</td>
       <td>float64</td>
       <td>The "Fare".</td>
     </tr>
     <tr>
+      <td>2</td>
       <td>SibSp</td>
       <td>int64</td>
       <td>The "SibSp".</td>
     </tr>
     <tr>
+      <td>3</td>
       <td>Parch</td>
       <td>int64</td>
       <td>The "Parch".</td>
     </tr>
     <tr>
+      <td>4</td>
       <td>Embarked</td>
       <td>object</td>
       <td>The "Embarked".</td>
     </tr>
     <tr>
+      <td>5</td>
       <td>Sex</td>
       <td>object</td>
       <td>The "Sex".</td>
     </tr>
     <tr>
+      <td>6</td>
       <td>Pclass</td>
       <td>int64</td>
       <td>The "Pclass".</td>
     </tr>
     <tr>
+      <td>7</td>
       <td>Age + Fare</td>
       <td>float64</td>
       <td>The sum of the "Age" and the "Fare".</td>
     </tr>
     <tr>
+      <td>8</td>
       <td>Age + Parch</td>
       <td>float64</td>
       <td>The sum of the "Age" and the "Parch".</td>
     </tr>
     <tr>
+      <td>9</td>
       <td>Age + SibSp</td>
       <td>float64</td>
       <td>The sum of the "Age" and the "SibSp".</td>
     </tr>
     <tr>
+      <td>10</td>
       <td>Fare + Parch</td>
       <td>float64</td>
       <td>The sum of the "Fare" and the "Parch".</td>
     </tr>
     <tr>
+      <td>11</td>
       <td>Fare + SibSp</td>
       <td>float64</td>
       <td>The sum of the "Fare" and the "SibSp".</td>
     </tr>
     <tr>
+      <td>12</td>
       <td>Parch + SibSp</td>
       <td>int64</td>
       <td>The sum of the "Parch" and the "SibSp".</td>
     </tr>
     <tr>
+      <td>13</td>
       <td>Age * Fare</td>
       <td>float64</td>
       <td>The product of the "Age" and the "Fare".</td>
     </tr>
     <tr>
+      <td>14</td>
       <td>Age * Parch</td>
       <td>float64</td>
       <td>The product of the "Age" and the "Parch".</td>
     </tr>
     <tr>
+      <td>15</td>
       <td>Age * SibSp</td>
       <td>float64</td>
       <td>The product of the "Age" and the "SibSp".</td>
     </tr>
     <tr>
+      <td>16</td>
       <td>Fare * Parch</td>
       <td>float64</td>
       <td>The product of the "Fare" and the "Parch".</td>
     </tr>
     <tr>
+      <td>17</td>
       <td>Fare * SibSp</td>
       <td>float64</td>
       <td>The product of the "Fare" and the "SibSp".</td>
     </tr>
     <tr>
+      <td>18</td>
       <td>Parch * SibSp</td>
       <td>int64</td>
       <td>The product of the "Parch" and the "SibSp".</td>

diff --git a/setup.py b/setup.py
@@ -19,7 +19,7 @@
      long_description_content_type='text/markdown',
      url="https://github.com/medoidai/skrobot",
      python_requires='>=3.6',
-     install_requires=['featuretools==0.22.0',
+     install_requires=['featuretools==0.23.0',
                        'joblib==1.0.0',
                        'matplotlib==3.3.3',
                        'numpy==1.19.4',
@@ -45,4 +45,4 @@
          "Topic :: Scientific/Engineering",
          "Topic :: Scientific/Engineering :: Artificial Intelligence",
          "Environment :: Console"]
-)
+)
diff --git a/skrobot/tasks/deep_feature_synthesis_task.py b/skrobot/tasks/deep_feature_synthesis_task.py
@@ -128,21 +128,20 @@ def run(self, output_directory):
 
     feature_defs = [ o for o in feature_defs if o.get_name() != self.label_column and o.get_name() not in label_related_columns_to_drop ]
 
-    if self.export_feature_graphs:
-      for feature_def in feature_defs:
-        ft.graph_feature(feature_def, to_file=os.path.join(output_directory, 'feature_graphs', f'{feature_def.get_name()}.png'), description=True)
+    features_df = pd.DataFrame({ 'feature_id' : range(len(feature_defs)), 'feature_def' : feature_defs })
+
+    features_df['feature_name'] = features_df['feature_def'].apply(lambda o: o.get_name())
+    features_df['feature_type'] = features_df['feature_def'].apply(lambda o: synthesized_dataset.dtypes[o.get_name()])
 
-    features = { 'feature_name': [], 'feature_type': [], 'feature_description': [] }
+    if self.export_feature_graphs:
+      features_df.apply(lambda o: ft.graph_feature(o['feature_def'], to_file=os.path.join(output_directory, 'feature_graphs', f'{o["feature_id"]}.png'), description=True), axis=1)
 
     if self.export_feature_information:
-      for feature_def in feature_defs:
-        feature_name = feature_def.get_name()
+      features_df['feature_description'] = features_df['feature_def'].apply(lambda o: ft.describe_feature(o))
 
-        features['feature_name'].append(feature_name)
-        features['feature_type'].append(synthesized_dataset.dtypes[feature_name])
-        features['feature_description'].append(ft.describe_feature(feature_def))
+      features_df.drop(columns=['feature_def'], inplace=True)
 
-      pd.DataFrame(features).to_html(os.path.join(output_directory, 'feature_information.html'), index=False)
+      features_df.to_html(os.path.join(output_directory, 'feature_information.html'), index=False)
 
     synthesized_dataset.reset_index(inplace=True)
 

diff --git a/static/image-28.png b/static/image-28.png
diff --git a/static/image-31.png b/static/image-31.png