From 76cf672cf57de1dac957cd6cf5bc44404f54e45c Mon Sep 17 00:00:00 2001
From: zhzhao8888 <43188418+zhzhao8888@users.noreply.github.com>
Date: Wed, 27 Sep 2023 09:54:13 -0700
Subject: [PATCH] Add notebooks for parquet files (#237)
---
.../AutomateTools_ParquetGenerator.ipynb | 364 +++++++++++++++
...arquetFilesAndIngestionToCustomTable.ipynb | 441 ++++++++++++++++++
2 files changed, 805 insertions(+)
create mode 100644 scenario-notebooks/Hunting-Notebooks/AutomateTools_ParquetGenerator.ipynb
create mode 100644 scenario-notebooks/Hunting-Notebooks/Hunting-QueryParquetFilesAndIngestionToCustomTable.ipynb
diff --git a/scenario-notebooks/Hunting-Notebooks/AutomateTools_ParquetGenerator.ipynb b/scenario-notebooks/Hunting-Notebooks/AutomateTools_ParquetGenerator.ipynb
new file mode 100644
index 0000000..928946f
--- /dev/null
+++ b/scenario-notebooks/Hunting-Notebooks/AutomateTools_ParquetGenerator.ipynb
@@ -0,0 +1,364 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Automate Tools - Parquet Files Generator\n",
+ "\n",
+ "__Notebook Version:__ 1.0
\n",
+ "__Python Version:__ Python 3.8
\n",
+ "__Apache Spark Version:__ 3.1
\n",
+ "__Required Packages:__ azure-monitor-query, azure-mgmt-loganalytics
\n",
+ "__Platforms Supported:__ Azure Synapse Analytics\n",
+ " \n",
+ "### Description\n",
+ "\n",
+ "## Table of Contents\n",
+ "1. Warm-up\n",
+ "2. Azure Log Analytics Data Queries\n",
+ "3. Save result to Azure Log Analytics Custom Table"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 1. Warm-up"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Load Python libraries that will be used in this notebook\n",
+ "from azure.mgmt.loganalytics import LogAnalyticsManagementClient\n",
+ "from azure.monitor.query import LogsQueryClient, MetricsQueryClient, LogsQueryStatus\n",
+ "from azure.monitor.ingestion import LogsIngestionClient\n",
+ "from azure.identity import AzureCliCredential, DefaultAzureCredential, ClientSecretCredential\n",
+ "from azure.core.exceptions import HttpResponseError \n",
+ "\n",
+ "import sys\n",
+ "from datetime import datetime, timezone, timedelta\n",
+ "import requests\n",
+ "import pandas as pd\n",
+ "import numpy\n",
+ "import json\n",
+ "import math\n",
+ "import ipywidgets\n",
+ "from IPython.display import display, HTML, Markdown"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# User input for Log Analytics workspace as the data source for querying\r\n",
+ "subscription_id_source = \"\"\r\n",
+ "resource_group_name_source = \"\"\r\n",
+ "workspace_name_source = \"\"\r\n",
+ "workspace_id_source = \"\"\r\n",
+ "workspace_resource_id_source = \"/subscriptions/{0}/resourceGroups/{1}/providers/Microsoft.OperationalInsights/workspaces/{2}\".format(subscription_id_source, resource_group_name_source, workspace_name_source)\r\n"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# User input for Log Analytics workspace for data ingestion\r\n",
+ "tenant_id = \"\"\r\n",
+ "subscription_id = \"\"\r\n",
+ "workspace_id = \"\"\r\n",
+ "resource_group_name = \"\"\r\n",
+ "location = \"\"\r\n",
+ "workspace_name = ''\r\n",
+ "workspace_resource_id = \"/subscriptions/{0}/resourceGroups/{1}/providers/Microsoft.OperationalInsights/workspaces/{2}\".format(subscription_id, resource_group_name, workspace_name)\r\n",
+ "data_collection_endpoint_name = \"\"\r\n",
+ "data_collection_rule_name = \"\"\r\n",
+ "custom_table_name = \"\"\r\n",
+ "stream_name = \"Custom-\" + custom_table_name\r\n",
+ "immutable_rule_id = \"\"\r\n",
+ "dce_endpoint = \"\"\r\n",
+ "\r\n",
+ "akv_name = \"\"\r\n",
+ "client_id_name = \"\"\r\n",
+ "client_secret_name = \"\"\r\n",
+ "akv_link_name = \"\""
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# You may need to change resource_uri for various cloud environments.\r\n",
+ "resource_uri = \"https://api.loganalytics.io\"\r\n",
+ "client_id = mssparkutils.credentials.getSecret(akv_name, client_id_name, akv_link_name)\r\n",
+ "client_secret = mssparkutils.credentials.getSecret(akv_name, client_secret_name, akv_link_name)\r\n",
+ "\r\n",
+ "credential = ClientSecretCredential(\r\n",
+ " tenant_id=tenant_id, \r\n",
+ " client_id=client_id, \r\n",
+ " client_secret=client_secret)\r\n",
+ "access_token = credential.get_token(resource_uri + \"/.default\")\r\n",
+ "token = access_token[0]"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 2. Azure Log Analytics Data Queries"
+ ],
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Functions for query\r\n",
+ "def query_la(workspace_id_query, query):\r\n",
+ " la_data_client = LogsQueryClient(credential=credential)\r\n",
+ " end_time = datetime.now(timezone.utc)\r\n",
+ " start_time = end_time - timedelta(15)\r\n",
+ "\r\n",
+ " query_result = la_data_client.query_workspace(\r\n",
+ " workspace_id=workspace_id_query,\r\n",
+ " query=query,\r\n",
+ " timespan=(start_time, end_time))\r\n",
+ " \r\n",
+ " df_la_query = pd.DataFrame\r\n",
+ "\r\n",
+ " if query_result.status == LogsQueryStatus.SUCCESS:\r\n",
+ " if hasattr(query_result, 'tables'):\r\n",
+ " data = query_result.tables\r\n",
+ " if len(query_result.tables) > 1:\r\n",
+ " print('You have more than one tyable to processs')\r\n",
+ " elif query_result.status == LogsQueryStatus.PARTIAL:\r\n",
+ " data=query_result.partial_data\r\n",
+ " print(query_result.partial_error)\r\n",
+ " else:\r\n",
+ " print(query_result.error)\r\n",
+ " \r\n",
+ " if len(query_result.tables) > 1:\r\n",
+ " print('You have more than one tyable to processs')\r\n",
+ " for table in data:\r\n",
+ " df_la_query = pd.DataFrame(data=table.rows, columns=table.columns)\r\n",
+ " return df_la_query\r\n",
+ "\r\n",
+ "def slice_query_la(query, lookback_start, lookback_end='0', lookback_unit='h', query_row_limit=400000, split_factor=2):\r\n",
+ " \"Slice the time to render records <= 500K\"\r\n",
+ " query = query_template.format(lookback_start, lookback_unit, lookback_end)\r\n",
+ " count = ' | summarize count()'\r\n",
+ " count_query = query + count\r\n",
+ " df_count = query_la(workspace_id_source, count_query)\r\n",
+ " row_count = df_count['count_'][0]\r\n",
+ " print(row_count)\r\n",
+ " df_final = pd.DataFrame()\r\n",
+ "\r\n",
+ " if row_count > query_row_limit:\r\n",
+ " number_of_divide = 0\r\n",
+ " while row_count > query_row_limit:\r\n",
+ " row_count = row_count / split_factor\r\n",
+ " number_of_divide = number_of_divide + 1\r\n",
+ "\r\n",
+ " factor = 2 ** number_of_divide\r\n",
+ " step_number = math.ceil(int(lookback_start) / factor)\r\n",
+ "\r\n",
+ " try:\r\n",
+ " for i in range(int(lookback_end), factor + 1, 1):\r\n",
+ " if i > 0:\r\n",
+ " df_la_query = pd.DataFrame\r\n",
+ " current_query = query_template.format(i * step_number, lookback_unit, (i - 1) * step_number)\r\n",
+ " print(current_query)\r\n",
+ " df_la_query = query_la(workspace_id_source, current_query)\r\n",
+ " print(df_la_query.shape[0])\r\n",
+ " df_final = pd.concat([df_final, df_la_query])\r\n",
+ " except:\r\n",
+ " print(\"query failed\")\r\n",
+ " raise\r\n",
+ " else:\r\n",
+ " df_final = query_la(workspace_id_source, query_template.format(lookback_start, lookback_unit, lookback_end))\r\n",
+ "\r\n",
+ " return df_final"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Slice data for query"
+ ],
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Use Dror's test LA table\r\n",
+ "query_template = \"let t1 = SecurityAlert | extend ent = parse_json(Entities)| extend ip = tostring(ent[0]['Address']) | project-keep TimeGenerated, ip; let t2 = CommonSecurityLog | where TimeGenerated > ago({0}{1}) and TimeGenerated <= ago({2}{1}) | project ip = DestinationIP; t1 | join kind=innerunique t2 on ip\"\r\n",
+ "lookback_start = '4'\r\n",
+ "\r\n",
+ "df_final = slice_query_la(query_template, lookback_start)\r\n",
+ "print(df_final.shape[0])"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "spark.conf.set(\"spark.sql.execution.arrow.enabled\",\"true\")\r\n",
+ "spark_final=spark.createDataFrame(df_final) \r\n",
+ "spark_final.printSchema()\r\n",
+ "spark_final.show()"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "path = 'abfss://modsynapsefiles@modstorageforsynapse.dfs.core.windows.net/demodata/df_final/{0}'.format(datetime.now().strftime('%Y%m%d%H%M%S'))"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "spark_final.write.parquet(path, mode='overwrite')"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "spark.read.parquet(path).count()"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "name": "synapse_pyspark",
+ "display_name": "Synapse PySpark"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "description": null,
+ "save_output": true,
+ "synapse_widget": {
+ "version": "0.1",
+ "state": {}
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
diff --git a/scenario-notebooks/Hunting-Notebooks/Hunting-QueryParquetFilesAndIngestionToCustomTable.ipynb b/scenario-notebooks/Hunting-Notebooks/Hunting-QueryParquetFilesAndIngestionToCustomTable.ipynb
new file mode 100644
index 0000000..849bd93
--- /dev/null
+++ b/scenario-notebooks/Hunting-Notebooks/Hunting-QueryParquetFilesAndIngestionToCustomTable.ipynb
@@ -0,0 +1,441 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Hunting - Query Parquet Files and MDTI API and Ingestion to Custom Table\n",
+ "\n",
+ "__Notebook Version:__ 1.0
\n",
+ "__Python Version:__ Python 3.8
\n",
+ "__Apache Spark Version:__ 3.1
\n",
+ "__Required Packages:__ azure-monitor-query, azure-mgmt-loganalytics
\n",
+ "__Platforms Supported:__ Azure Synapse Analytics\n",
+ " \n",
+ "__Data Source Required:__ Log Analytics custom table defined\n",
+ " \n",
+ "### Description\n",
+ "This notebook provides step-by-step instructions and sample code to query parquet data from Azure Data Lake Storage and then store it back to Log Analytocs pre-defined custom table.
\n",
+ "*** Please run the cells sequentially to avoid errors. Please do not use \"run all cells\". ***
\n",
+ "\n",
+ "## Table of Contents\n",
+ "1. Warm-up\n",
+ "2. ADLS Parquet Data Queries\n",
+ "3. Save result to Azure Log Analytics Custom Table"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 1. Warm-up"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Load Python libraries that will be used in this notebook\n",
+ "from azure.mgmt.loganalytics import LogAnalyticsManagementClient\n",
+ "from azure.monitor.query import LogsQueryClient, MetricsQueryClient, LogsQueryStatus\n",
+ "from azure.monitor.ingestion import LogsIngestionClient\n",
+ "from azure.identity import AzureCliCredential, DefaultAzureCredential, ClientSecretCredential\n",
+ "from azure.core.exceptions import HttpResponseError \n",
+ "\n",
+ "import functools\n",
+ "from pyspark.sql import SparkSession\n",
+ "from pyspark.sql.types import *\n",
+ "\n",
+ "import sys\n",
+ "from datetime import datetime, timezone, timedelta\n",
+ "import requests\n",
+ "import pandas as pd\n",
+ "import numpy\n",
+ "import json\n",
+ "import math\n",
+ "import ipywidgets\n",
+ "from IPython.display import display, HTML, Markdown"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# User input for Log Analytics workspace for data ingestion\r\n",
+ "tenant_id = \"\"\r\n",
+ "subscription_id = \"\"\r\n",
+ "workspace_id = \"\"\r\n",
+ "resource_group_name = \"\"\r\n",
+ "location = \"\"\r\n",
+ "workspace_name = \"\"\r\n",
+ "workspace_resource_id = \"/subscriptions/{0}/resourceGroups/{1}/providers/Microsoft.OperationalInsights/workspaces/{2}\".format(subscription_id, resource_group_name, workspace_name)\r\n",
+ "data_collection_endpoint_name = \"\"\r\n",
+ "data_collection_rule_name = \"\"\r\n",
+ "custom_table_name = \"\"\r\n",
+ "stream_name = \"Custom-\" + custom_table_name\r\n",
+ "immutable_rule_id = \"\"\r\n",
+ "dce_endpoint = \"\"\r\n",
+ "\r\n",
+ "akv_name = \"\"\r\n",
+ "client_id_name = \"\"\r\n",
+ "client_secret_name = \"\"\r\n",
+ "akv_link_name = \"\""
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Inputs for ADLS Parquet file path\r\n",
+ "stroage_account_name = \"\"\r\n",
+ "container_name = \"\"\r\n",
+ "folder_path = \"\"\r\n",
+ "lookback_hours = 8"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# You may need to change resource_uri for various cloud environments.\r\n",
+ "resource_uri = \"https://api.loganalytics.io\"\r\n",
+ "client_id = mssparkutils.credentials.getSecret(akv_name, client_id_name, akv_link_name)\r\n",
+ "client_secret = mssparkutils.credentials.getSecret(akv_name, client_secret_name, akv_link_name)\r\n",
+ "\r\n",
+ "credential = ClientSecretCredential(\r\n",
+ " tenant_id=tenant_id, \r\n",
+ " client_id=client_id, \r\n",
+ " client_secret=client_secret)\r\n",
+ "access_token = credential.get_token(resource_uri + \"/.default\")\r\n",
+ "token = access_token[0]"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 2. ADLS Data Queries"
+ ],
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "current_time = datetime.now()\r\n",
+ "lookback_time = datetime.now() - timedelta(hours = lookback_hours)\r\n",
+ "spark_session = SparkSession.builder.appName('Empty_Dataframe').getOrCreate()\r\n",
+ "\r\n",
+ "def unionAll(dfs):\r\n",
+ " return functools.reduce(lambda df1, df2: df1.union(df2.select(df1.columns)), dfs)\r\n",
+ "\r\n",
+ "i = 0 \r\n",
+ "for file_info in list_file:\r\n",
+ " if file_info.isDir:\r\n",
+ " modified_time = datetime.fromtimestamp(file_info.modifyTime / 1e3)\r\n",
+ " if modified_time >= lookback_time and modified_time < datetime.now():\r\n",
+ " print(file_info.name)\r\n",
+ " path = 'abfss://{0}@{1}.dfs.core.windows.net/{2}/{3}'.format(container_name, stroage_account_name, folder_path, file_info.name)\r\n",
+ " print(path)\r\n",
+ " df_parquet = spark.read.parquet(path)\r\n",
+ " print(df_parquet.count())\r\n",
+ " if i == 0:\r\n",
+ " df_spark = df_parquet\r\n",
+ " i = i + 1\r\n",
+ " else: \r\n",
+ " df_spark = unionAll([df_spark, df_parquet])\r\n",
+ " \r\n",
+ " "
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_final = df_spark.toPandas()\r\n",
+ "df_final.shape[0]"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Service Data: MDTI API"
+ ],
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Calling Microsoft MDTI API for List, the same template can be used for calling other Azure REST APIs with different parameters.\r\n",
+ "# For different environments, such as national clouds, you may need to use different root_url, please contact with your admins.\r\n",
+ "# It can be ---.azure.us, ---.azure.microsoft.scloud, ---.azure.eaglex.ic.gov, etc.\r\n",
+ "def call_mdti_api_for_read(token, resource):\r\n",
+ " \"Calling Microsoft MDTI API\"\r\n",
+ " headers = {\"Authorization\": token, \"content-type\":\"application/json\" }\r\n",
+ " root_url = \"https://graph.microsoft.com\"\r\n",
+ " mdti_url_template = \"{0}/beta/security/threatIntelligence/{1}\"\r\n",
+ " mdti_url = mdti_url_template.format(root_url, resource)\r\n",
+ " # print(mdti_url)\r\n",
+ " try:\r\n",
+ " response = requests.get(mdti_url, headers=headers, verify=True)\r\n",
+ " return response\r\n",
+ " except HttpResponseError as e:\r\n",
+ " print(f\"Calling MDTI API failed: {e}\")\r\n",
+ " return None\r\n",
+ "\r\n",
+ "def get_token_for_graph():\r\n",
+ " resource_uri = \"https://graph.microsoft.com\"\r\n",
+ " client_id = mssparkutils.credentials.getSecret(akv_name, client_id_name, akv_link_name)\r\n",
+ " client_secret = mssparkutils.credentials.getSecret(akv_name, client_secret_name, akv_link_name)\r\n",
+ "\r\n",
+ " credential = ClientSecretCredential(\r\n",
+ " tenant_id=tenant_id, \r\n",
+ " client_id=client_id, \r\n",
+ " client_secret=client_secret)\r\n",
+ " access_token = credential.get_token(resource_uri + \"/.default\")\r\n",
+ " return access_token[0]"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Calling MDTI API, hosts as example\r\n",
+ "header_token_value = \"Bearer {}\".format(get_token_for_graph())\r\n",
+ "response_mdti_host = call_mdti_api_for_read(header_token_value, \"hosts('www.microsoft.com')\")"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_final.loc[df_final['ip'].str.startswith('23.'), 'Fact'] = response_mdti_host.json()[\"registrar\"]"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_merged = df_final.rename(columns = {'TimeGenerated': 'TimeGenerated', 'ip': 'IP', 'Fact': 'Fact'})[['TimeGenerated', 'IP', 'Fact']]"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 3. Save result to Azure Log Analytics Custom Table"
+ ],
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# function for data converting\r\n",
+ "def convert_dataframe_to_list_of_dictionaries(df, hasTimeGeneratedColumn):\r\n",
+ " list = df.to_dict('records')\r\n",
+ "\r\n",
+ " for row in list:\r\n",
+ " # The dataframe may have more than one datetime columns, add all datetiome columns inside this loop, to render ISO 8601\r\n",
+ " if hasTimeGeneratedColumn and row['TimeGenerated'] != None:\r\n",
+ " row['TimeGenerated']= row['TimeGenerated'].strftime(\"%Y-%m-%dT%H:%M:%S.%fZ\")\r\n",
+ " \r\n",
+ " return list\r\n",
+ "\r\n",
+ "def check_dataframe_size_in_mb(df, size_limit_in_mb=25):\r\n",
+ " \"Check if dataframe has more than 25 MB data, 30 MB is the limit for POST\"\r\n",
+ " size_in_mb = sys.getsizeof(df) / 1000000\r\n",
+ " return size_in_mb / size_limit_in_mb\r\n",
+ "\r\n",
+ "def partition_dataframe_for_data_infestion(df):\r\n",
+ " df_size = check_dataframe_size_in_mb(df)\r\n",
+ " if df_size > 1:\r\n",
+ " partition_number = math.ceil(df_size)\r\n",
+ " index_block = len(df) // partition_number\r\n",
+ "\r\n",
+ " list_df = [df[i:i+index_block] for i in range(0,df.shape[0],index_block)]\r\n",
+ " return list_df\r\n",
+ " else:\r\n",
+ " return [df]"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Data ingestion to LA custom table\r\n",
+ "client = LogsIngestionClient(endpoint=dce_endpoint, credential=credential, logging_enable=True)\r\n",
+ "\r\n",
+ "try:\r\n",
+ " ind = 0\r\n",
+ " list_df = partition_dataframe_for_data_infestion(df_merged)\r\n",
+ " for df in list_df:\r\n",
+ " body = convert_dataframe_to_list_of_dictionaries(df, True)\r\n",
+ " print(ind)\r\n",
+ " print(df.shape[0])\r\n",
+ " ingestion_result = client.upload(rule_id=immutable_rule_id, stream_name=stream_name, logs=body)\r\n",
+ " ind = ind + 1\r\n",
+ "except HttpResponseError as e:\r\n",
+ " print(f\"Data ingestion failed: {e}\")"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "name": "synapse_pyspark",
+ "display_name": "Synapse PySpark"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "description": null,
+ "save_output": true,
+ "synapse_widget": {
+ "version": "0.1",
+ "state": {}
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file