diff --git a/02_data_prep.ipynb b/02_data_prep.ipynb index ff8e999..b67443f 100644 --- a/02_data_prep.ipynb +++ b/02_data_prep.ipynb @@ -65,7 +65,9 @@ "### Key Terms\n", "* *FAIR* or *Findable, Accessible, Interoperable, Reusable* - \n", "* *ARCO data* or *Analysis-ready, cloud-optimised data* - \n", - "* pipeline - A series of tasks or operations, preferably based on resuable components, where the input of one step in the pipeline is the output of the previous step." + "* pipeline - A series of tasks or operations, preferably based on resuable components, where the input of one step in the pipeline is the output of the previous step.\n", + "* workflow - similar to a pipeline as a series of tasks, but there may be multiple consecutive or parallel pipelines in a workflow. (Definitions may differ!)\n", + "* catalog" ] }, { @@ -109,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 1, "id": "32d81711-60f1-4996-b808-f24760cbc982", "metadata": {}, "outputs": [], @@ -122,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 2, "id": "72aa9221-d29c-4ee4-81d9-1304537d3354", "metadata": {}, "outputs": [], @@ -132,7 +134,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 3, "id": "ddef7864-09de-4d14-8a0e-aa66b1cac6d9", "metadata": {}, "outputs": [], @@ -143,7 +145,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "id": "5fb8d33b-be41-4a20-b636-9395f6c3606d", "metadata": {}, "outputs": [], @@ -155,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 5, "id": "73805561-5820-4b3b-b856-cb359b84ffc6", "metadata": {}, "outputs": [ @@ -172,7 +174,7 @@ "PosixPath('/Users/stephen.haddad/data/ukrse2022')" ] }, - "execution_count": 38, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -189,18 +191,18 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 23, "id": 
"3b8688c7-7db6-4b4d-94bd-fd0c504e6815", "metadata": {}, "outputs": [], "source": [ - "load_from_zenodo=True\n", + "load_from_zenodo=False\n", "zenodo_record_root = 'https://zenodo.org/record/7022648/files/'" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 24, "id": "63754eaa-add1-4269-9100-0abe550ada99", "metadata": {}, "outputs": [ @@ -210,7 +212,7 @@ "PosixPath('https:/zenodo.org/record/7022648/files/2021_met_office_aviation_rotors.csv')" ] }, - "execution_count": 41, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -221,17 +223,24 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 25, "id": "f67317ad-7819-406e-ba53-e9cbbd0388c6", "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n" + ] + }, { "data": { "text/plain": [ - "'https://zenodo.org/record/7022648/files//2021_met_office_aviation_rotors.csv'" + "PosixPath('/Users/stephen.haddad/data/ukrse2022/2021_met_office_aviation_rotors.csv')" ] }, - "execution_count": 46, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -258,10 +267,20 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 26, "id": "706bb929-93ee-4789-ab90-3fc276b6f5ac", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/stephen.haddad/opt/anaconda3/envs/ukrse2022_mlops_data_prep/lib/python3.8/site-packages/distributed/node.py:179: UserWarning: Port 8787 is already in use.\n", + "Perhaps you already have a cluster running?\n", + "Hosting the HTTP server on port 55878 instead\n", + " warnings.warn(\n" + ] + }, { "data": { "text/html": [ @@ -270,11 +289,11 @@ " \n", "
\n", "

LocalCluster

\n", - "

4e0989a6

\n", + "

de35a1bd

\n", " \n", " \n", " \n", "
\n", - " Dashboard: http://127.0.0.1:8787/status\n", + " Dashboard: http://127.0.0.1:55878/status\n", " \n", " Workers: 4\n", @@ -307,11 +326,11 @@ "
\n", "
\n", "

Scheduler

\n", - "

Scheduler-9a2c49c6-9791-4964-baae-ed2b0f780f0f

\n", + "

Scheduler-199f137b-4d91-4577-87e7-a2df713cd3dd

\n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", - " Comm: tcp://127.0.0.1:54428\n", + " Comm: tcp://127.0.0.1:55879\n", " \n", " Workers: 4\n", @@ -319,7 +338,7 @@ "
\n", - " Dashboard: http://127.0.0.1:8787/status\n", + " Dashboard: http://127.0.0.1:55878/status\n", " \n", " Total threads: 8\n", @@ -353,7 +372,7 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -398,7 +417,7 @@ "
\n", - " Comm: tcp://127.0.0.1:54444\n", + " Comm: tcp://127.0.0.1:55900\n", " \n", " Total threads: 2\n", @@ -361,7 +380,7 @@ "
\n", - " Dashboard: http://127.0.0.1:54447/status\n", + " Dashboard: http://127.0.0.1:55901/status\n", " \n", " Memory: 4.00 GiB\n", @@ -369,13 +388,13 @@ "
\n", - " Nanny: tcp://127.0.0.1:54434\n", + " Nanny: tcp://127.0.0.1:55884\n", "
\n", - " Local directory: /var/folders/w0/2x361bn95wj7lfgl33vksx1w0000gn/T/dask-worker-space/worker-kp9i_b4p\n", + " Local directory: /var/folders/w0/2x361bn95wj7lfgl33vksx1w0000gn/T/dask-worker-space/worker-q4noz_vs\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -443,7 +462,7 @@ "
\n", - " Comm: tcp://127.0.0.1:54443\n", + " Comm: tcp://127.0.0.1:55903\n", " \n", " Total threads: 2\n", @@ -406,7 +425,7 @@ "
\n", - " Dashboard: http://127.0.0.1:54449/status\n", + " Dashboard: http://127.0.0.1:55904/status\n", " \n", " Memory: 4.00 GiB\n", @@ -414,13 +433,13 @@ "
\n", - " Nanny: tcp://127.0.0.1:54431\n", + " Nanny: tcp://127.0.0.1:55885\n", "
\n", - " Local directory: /var/folders/w0/2x361bn95wj7lfgl33vksx1w0000gn/T/dask-worker-space/worker-12pciygk\n", + " Local directory: /var/folders/w0/2x361bn95wj7lfgl33vksx1w0000gn/T/dask-worker-space/worker-3joi2f24\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -488,7 +507,7 @@ "
\n", - " Comm: tcp://127.0.0.1:54446\n", + " Comm: tcp://127.0.0.1:55894\n", " \n", " Total threads: 2\n", @@ -451,7 +470,7 @@ "
\n", - " Dashboard: http://127.0.0.1:54450/status\n", + " Dashboard: http://127.0.0.1:55896/status\n", " \n", " Memory: 4.00 GiB\n", @@ -459,13 +478,13 @@ "
\n", - " Nanny: tcp://127.0.0.1:54433\n", + " Nanny: tcp://127.0.0.1:55882\n", "
\n", - " Local directory: /var/folders/w0/2x361bn95wj7lfgl33vksx1w0000gn/T/dask-worker-space/worker-kli_b2o9\n", + " Local directory: /var/folders/w0/2x361bn95wj7lfgl33vksx1w0000gn/T/dask-worker-space/worker-x2_l92o8\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -532,7 +551,7 @@ "" ], "text/plain": [ - "LocalCluster(4e0989a6, 'tcp://127.0.0.1:54428', workers=4, threads=8, memory=16.00 GiB)" + "LocalCluster(de35a1bd, 'tcp://127.0.0.1:55879', workers=4, threads=8, memory=16.00 GiB)" ] }, "metadata": {}, @@ -546,7 +565,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 27, "id": "bdb88a58-70a8-4f0d-af8e-cbbc40e0e63f", "metadata": {}, "outputs": [ @@ -557,7 +576,7 @@ "
\n", "
\n", "

Client

\n", - "

Client-7fdbd90c-2947-11ed-8df5-acde48001122

\n", + "

Client-a887c26e-29d6-11ed-9aa9-acde48001122

\n", "
\n", - " Comm: tcp://127.0.0.1:54445\n", + " Comm: tcp://127.0.0.1:55895\n", " \n", " Total threads: 2\n", @@ -496,7 +515,7 @@ "
\n", - " Dashboard: http://127.0.0.1:54448/status\n", + " Dashboard: http://127.0.0.1:55897/status\n", " \n", " Memory: 4.00 GiB\n", @@ -504,13 +523,13 @@ "
\n", - " Nanny: tcp://127.0.0.1:54432\n", + " Nanny: tcp://127.0.0.1:55883\n", "
\n", - " Local directory: /var/folders/w0/2x361bn95wj7lfgl33vksx1w0000gn/T/dask-worker-space/worker-wpmofvjb\n", + " Local directory: /var/folders/w0/2x361bn95wj7lfgl33vksx1w0000gn/T/dask-worker-space/worker-o15jhgzy\n", "
\n", "\n", " \n", @@ -570,7 +589,7 @@ " \n", " \n", " \n", " \n", " \n", @@ -586,11 +605,11 @@ " \n", "
\n", "

LocalCluster

\n", - "

4e0989a6

\n", + "

de35a1bd

\n", "
\n", - " Dashboard: http://127.0.0.1:8787/status\n", + " Dashboard: http://127.0.0.1:55878/status\n", "
\n", " \n", " \n", "
\n", - " Dashboard: http://127.0.0.1:8787/status\n", + " Dashboard: http://127.0.0.1:55878/status\n", " \n", " Workers: 4\n", @@ -623,11 +642,11 @@ "
\n", "
\n", "

Scheduler

\n", - "

Scheduler-9a2c49c6-9791-4964-baae-ed2b0f780f0f

\n", + "

Scheduler-199f137b-4d91-4577-87e7-a2df713cd3dd

\n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", - " Comm: tcp://127.0.0.1:54428\n", + " Comm: tcp://127.0.0.1:55879\n", " \n", " Workers: 4\n", @@ -635,7 +654,7 @@ "
\n", - " Dashboard: http://127.0.0.1:8787/status\n", + " Dashboard: http://127.0.0.1:55878/status\n", " \n", " Total threads: 8\n", @@ -669,7 +688,7 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -714,7 +733,7 @@ "
\n", - " Comm: tcp://127.0.0.1:54444\n", + " Comm: tcp://127.0.0.1:55900\n", " \n", " Total threads: 2\n", @@ -677,7 +696,7 @@ "
\n", - " Dashboard: http://127.0.0.1:54447/status\n", + " Dashboard: http://127.0.0.1:55901/status\n", " \n", " Memory: 4.00 GiB\n", @@ -685,13 +704,13 @@ "
\n", - " Nanny: tcp://127.0.0.1:54434\n", + " Nanny: tcp://127.0.0.1:55884\n", "
\n", - " Local directory: /var/folders/w0/2x361bn95wj7lfgl33vksx1w0000gn/T/dask-worker-space/worker-kp9i_b4p\n", + " Local directory: /var/folders/w0/2x361bn95wj7lfgl33vksx1w0000gn/T/dask-worker-space/worker-q4noz_vs\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -759,7 +778,7 @@ "
\n", - " Comm: tcp://127.0.0.1:54443\n", + " Comm: tcp://127.0.0.1:55903\n", " \n", " Total threads: 2\n", @@ -722,7 +741,7 @@ "
\n", - " Dashboard: http://127.0.0.1:54449/status\n", + " Dashboard: http://127.0.0.1:55904/status\n", " \n", " Memory: 4.00 GiB\n", @@ -730,13 +749,13 @@ "
\n", - " Nanny: tcp://127.0.0.1:54431\n", + " Nanny: tcp://127.0.0.1:55885\n", "
\n", - " Local directory: /var/folders/w0/2x361bn95wj7lfgl33vksx1w0000gn/T/dask-worker-space/worker-12pciygk\n", + " Local directory: /var/folders/w0/2x361bn95wj7lfgl33vksx1w0000gn/T/dask-worker-space/worker-3joi2f24\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -804,7 +823,7 @@ "
\n", - " Comm: tcp://127.0.0.1:54446\n", + " Comm: tcp://127.0.0.1:55894\n", " \n", " Total threads: 2\n", @@ -767,7 +786,7 @@ "
\n", - " Dashboard: http://127.0.0.1:54450/status\n", + " Dashboard: http://127.0.0.1:55896/status\n", " \n", " Memory: 4.00 GiB\n", @@ -775,13 +794,13 @@ "
\n", - " Nanny: tcp://127.0.0.1:54433\n", + " Nanny: tcp://127.0.0.1:55882\n", "
\n", - " Local directory: /var/folders/w0/2x361bn95wj7lfgl33vksx1w0000gn/T/dask-worker-space/worker-kli_b2o9\n", + " Local directory: /var/folders/w0/2x361bn95wj7lfgl33vksx1w0000gn/T/dask-worker-space/worker-x2_l92o8\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -853,10 +872,10 @@ "" ], "text/plain": [ - "" + "" ] }, - "execution_count": 14, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -876,7 +895,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 28, "id": "fc3954b1-5f68-4a2c-bff1-c65831d98a66", "metadata": {}, "outputs": [ @@ -1312,7 +1331,7 @@ "Dask Name: read-csv, 1 tasks" ] }, - "execution_count": 47, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1324,7 +1343,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 29, "id": "e6082abb-89d4-43cf-a1ad-ae005cbdcffe", "metadata": {}, "outputs": [ @@ -1353,7 +1372,7 @@ " dtype='object')" ] }, - "execution_count": 49, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1364,7 +1383,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 30, "id": "5945b6c3-a60d-4c3c-b44b-97e50882a431", "metadata": {}, "outputs": [], @@ -1374,7 +1393,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 31, "id": "f56234e9-1062-40fc-893d-861ed205d63d", "metadata": {}, "outputs": [], @@ -1388,7 +1407,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 32, "id": "55c94c1b-1d8a-45cf-b9c9-065e023bb268", "metadata": {}, "outputs": [ @@ -1427,7 +1446,7 @@ "

Layer1: read-csv

\n", " \n", "

\n", - " read-csv-1e7bfeac4ff4e0386ae4e2f7bd1356d8\n", + " read-csv-e1701bbd29ad5a0d155e1052f943535d\n", "

\n", "\n", "
\n", - " Comm: tcp://127.0.0.1:54445\n", + " Comm: tcp://127.0.0.1:55895\n", " \n", " Total threads: 2\n", @@ -812,7 +831,7 @@ "
\n", - " Dashboard: http://127.0.0.1:54448/status\n", + " Dashboard: http://127.0.0.1:55897/status\n", " \n", " Memory: 4.00 GiB\n", @@ -820,13 +839,13 @@ "
\n", - " Nanny: tcp://127.0.0.1:54432\n", + " Nanny: tcp://127.0.0.1:55883\n", "
\n", - " Local directory: /var/folders/w0/2x361bn95wj7lfgl33vksx1w0000gn/T/dask-worker-space/worker-wpmofvjb\n", + " Local directory: /var/folders/w0/2x361bn95wj7lfgl33vksx1w0000gn/T/dask-worker-space/worker-o15jhgzy\n", "
\n", @@ -1499,7 +1518,7 @@ "

Layer2: getitem

\n", " \n", "

\n", - " getitem-52ee5f0c8086f7484c2a294935631222\n", + " getitem-c0a2e4b8182ad8ec4a49492af87c7f93\n", "

\n", "\n", "
\n", @@ -1526,7 +1545,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1553,7 +1572,7 @@ "

Layer3: ge

\n", " \n", "

\n", - " ge-fd339e9baf7a08764e14b24e4ee0c0f4\n", + " ge-041d16d620c1d82105c0384ef7a63fcc\n", "

\n", "\n", "
depends on read-csv-1e7bfeac4ff4e0386ae4e2f7bd1356d8read-csv-e1701bbd29ad5a0d155e1052f943535d
\n", @@ -1580,7 +1599,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1607,7 +1626,7 @@ "

Layer4: getitem

\n", " \n", "

\n", - " getitem-168d0385d0b77b48bcdc5f1a4a2760e3\n", + " getitem-ca5ce8c02ff337a3c95de3fbcaae18b1\n", "

\n", "\n", "
depends on getitem-52ee5f0c8086f7484c2a294935631222getitem-c0a2e4b8182ad8ec4a49492af87c7f93
\n", @@ -1634,7 +1653,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1661,7 +1680,7 @@ "

Layer5: ge

\n", " \n", "

\n", - " ge-2d6dc26479f8d8416b35538beaee30cf\n", + " ge-d65d970fd632a771c0d65d1f2706fc68\n", "

\n", "\n", "
depends on read-csv-1e7bfeac4ff4e0386ae4e2f7bd1356d8read-csv-e1701bbd29ad5a0d155e1052f943535d
\n", @@ -1688,7 +1707,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1715,7 +1734,7 @@ "

Layer6: getitem

\n", " \n", "

\n", - " getitem-e01194bec5904b0356b3f48977472acb\n", + " getitem-db9ebe9d70c84abdd54397891f6e7daa\n", "

\n", "\n", "
depends on getitem-168d0385d0b77b48bcdc5f1a4a2760e3getitem-ca5ce8c02ff337a3c95de3fbcaae18b1
\n", @@ -1742,7 +1761,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1769,7 +1788,7 @@ "

Layer7: ge

\n", " \n", "

\n", - " ge-be9a7d9a8d7053cc079966cd9ec56666\n", + " ge-e33809b42ada954a96d13b04d13f95f7\n", "

\n", "\n", "
depends on read-csv-1e7bfeac4ff4e0386ae4e2f7bd1356d8read-csv-e1701bbd29ad5a0d155e1052f943535d
\n", @@ -1796,7 +1815,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1823,7 +1842,7 @@ "

Layer8: getitem

\n", " \n", "

\n", - " getitem-0fea9a71b59f2869f636afed6b761a69\n", + " getitem-d7cd9d29ea995ffc0f7766f968d008c1\n", "

\n", "\n", "
depends on getitem-e01194bec5904b0356b3f48977472acbgetitem-db9ebe9d70c84abdd54397891f6e7daa
\n", @@ -1850,7 +1869,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1877,7 +1896,7 @@ "

Layer9: ge

\n", " \n", "

\n", - " ge-441153336682d2ecb3672b5203c2aff0\n", + " ge-4a31ae9e8d9248371c6015cfd78af04e\n", "

\n", "\n", "
depends on read-csv-1e7bfeac4ff4e0386ae4e2f7bd1356d8read-csv-e1701bbd29ad5a0d155e1052f943535d
\n", @@ -1904,7 +1923,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1931,7 +1950,7 @@ "

Layer10: and_

\n", " \n", "

\n", - " and_-412f4d3d0aabb7235798beb80f9260f8\n", + " and_-97ac16fd883e17cb7678e54c871156e3\n", "

\n", "\n", "
depends on getitem-0fea9a71b59f2869f636afed6b761a69getitem-d7cd9d29ea995ffc0f7766f968d008c1
\n", @@ -1958,14 +1977,14 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1992,7 +2011,7 @@ "

Layer11: and_

\n", " \n", "

\n", - " and_-a446f4cea3f7617768b5e4b23d943735\n", + " and_-a6334d5090a4cbc1aa9f0299c4d2a558\n", "

\n", "\n", "
depends on ge-be9a7d9a8d7053cc079966cd9ec56666ge-4a31ae9e8d9248371c6015cfd78af04e
ge-441153336682d2ecb3672b5203c2aff0ge-e33809b42ada954a96d13b04d13f95f7
\n", @@ -2019,14 +2038,14 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2053,7 +2072,7 @@ "

Layer12: and_

\n", " \n", "

\n", - " and_-f585bb014bb223423d8ed1112f96c7c4\n", + " and_-bdd8caf5d1fd302b329e93e0c1ca81ea\n", "

\n", "\n", "
depends on ge-2d6dc26479f8d8416b35538beaee30cfand_-97ac16fd883e17cb7678e54c871156e3
and_-412f4d3d0aabb7235798beb80f9260f8ge-d65d970fd632a771c0d65d1f2706fc68
\n", @@ -2080,14 +2099,14 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2114,7 +2133,7 @@ "

Layer13: getitem

\n", " \n", "

\n", - " getitem-dee3807718b30c568c57632889cacc5f\n", + " getitem-52beaf6c5a777671f30c7bd21682d4a8\n", "

\n", "\n", "
depends on and_-a446f4cea3f7617768b5e4b23d943735and_-a6334d5090a4cbc1aa9f0299c4d2a558
ge-fd339e9baf7a08764e14b24e4ee0c0f4ge-041d16d620c1d82105c0384ef7a63fcc
\n", @@ -2166,14 +2185,14 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2194,23 +2213,23 @@ ], "text/plain": [ "HighLevelGraph with 13 layers.\n", - "\n", - " 0. read-csv-1e7bfeac4ff4e0386ae4e2f7bd1356d8\n", - " 1. getitem-52ee5f0c8086f7484c2a294935631222\n", - " 2. ge-fd339e9baf7a08764e14b24e4ee0c0f4\n", - " 3. getitem-168d0385d0b77b48bcdc5f1a4a2760e3\n", - " 4. ge-2d6dc26479f8d8416b35538beaee30cf\n", - " 5. getitem-e01194bec5904b0356b3f48977472acb\n", - " 6. ge-be9a7d9a8d7053cc079966cd9ec56666\n", - " 7. getitem-0fea9a71b59f2869f636afed6b761a69\n", - " 8. ge-441153336682d2ecb3672b5203c2aff0\n", - " 9. and_-412f4d3d0aabb7235798beb80f9260f8\n", - " 10. and_-a446f4cea3f7617768b5e4b23d943735\n", - " 11. and_-f585bb014bb223423d8ed1112f96c7c4\n", - " 12. getitem-dee3807718b30c568c57632889cacc5f" + "\n", + " 0. read-csv-e1701bbd29ad5a0d155e1052f943535d\n", + " 1. getitem-c0a2e4b8182ad8ec4a49492af87c7f93\n", + " 2. ge-041d16d620c1d82105c0384ef7a63fcc\n", + " 3. getitem-ca5ce8c02ff337a3c95de3fbcaae18b1\n", + " 4. ge-d65d970fd632a771c0d65d1f2706fc68\n", + " 5. getitem-db9ebe9d70c84abdd54397891f6e7daa\n", + " 6. ge-e33809b42ada954a96d13b04d13f95f7\n", + " 7. getitem-d7cd9d29ea995ffc0f7766f968d008c1\n", + " 8. ge-4a31ae9e8d9248371c6015cfd78af04e\n", + " 9. and_-97ac16fd883e17cb7678e54c871156e3\n", + " 10. and_-a6334d5090a4cbc1aa9f0299c4d2a558\n", + " 11. and_-bdd8caf5d1fd302b329e93e0c1ca81ea\n", + " 12. 
getitem-52beaf6c5a777671f30c7bd21682d4a8" ] }, - "execution_count": 52, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -2221,7 +2240,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 33, "id": "66406cce-2550-4c77-a725-41bffc03f275", "metadata": {}, "outputs": [], @@ -2231,7 +2250,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 34, "id": "823b81d5-062e-475b-a44d-95fb760e891e", "metadata": {}, "outputs": [], @@ -2241,7 +2260,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 35, "id": "980a89fc-1cbf-4ba2-89cd-b46e38f4c692", "metadata": {}, "outputs": [], @@ -2251,7 +2270,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 36, "id": "02c1c70b-b84b-4f99-b899-166475c5a781", "metadata": {}, "outputs": [ @@ -2687,7 +2706,7 @@ "Dask Name: getitem, 22 tasks" ] }, - "execution_count": 56, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -2699,7 +2718,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 37, "id": "990d8924-f554-4db8-b6b3-e8bbef59a1ee", "metadata": {}, "outputs": [], @@ -2710,18 +2729,18 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 38, "id": "772aff86-bddd-4a81-8339-1c003e3ffb0e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "((Delayed('int-21baeaa8-bce2-495e-8a6b-7370992d0a50'), 96),\n", - " (Delayed('int-13a8781a-4ac7-442e-84a1-86c411ad3665'), 96))" + "((Delayed('int-85386d55-f64b-4e4f-9b13-f815cd5972b3'), 96),\n", + " (Delayed('int-fe02e4b8-2fae-4de2-a65d-9bc3e107653b'), 96))" ] }, - "execution_count": 58, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -2740,7 +2759,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 39, "id": "322f873d-8089-466b-bf13-ef5e07f31174", "metadata": {}, "outputs": [], @@ -2751,7 +2770,7 @@ }, { "cell_type": "code", - "execution_count": 60, + 
"execution_count": 40, "id": "fcf48ad8-c8b4-4c9b-a83f-030d7c615835", "metadata": {}, "outputs": [ @@ -2761,7 +2780,7 @@ "((17037, 96), (449, 96))" ] }, - "execution_count": 60, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -2772,7 +2791,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 41, "id": "d53d7c9f-1984-4f84-abee-dfa14a058573", "metadata": {}, "outputs": [ @@ -3159,7 +3178,7 @@ "[17037 rows x 96 columns]" ] }, - "execution_count": 61, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -3170,7 +3189,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 42, "id": "5123c72c-7eae-44c2-b909-0be1cfe5acb4", "metadata": {}, "outputs": [], @@ -3183,7 +3202,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 43, "id": "475b350f-23fa-4a66-acd6-5c77311293d8", "metadata": {}, "outputs": [], @@ -3197,7 +3216,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 44, "id": "8049d9c1-6b31-46c6-968b-e1334468f7d8", "metadata": {}, "outputs": [ @@ -3205,8 +3224,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 950 ms, sys: 42.6 ms, total: 993 ms\n", - "Wall time: 995 ms\n" + "CPU times: user 804 ms, sys: 34 ms, total: 838 ms\n", + "Wall time: 827 ms\n" ] } ], @@ -3228,7 +3247,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 45, "id": "e8866baa-3f46-4061-b5ac-ae6481bcd27e", "metadata": {}, "outputs": [], @@ -3238,7 +3257,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 46, "id": "3dbc5924-5c6b-4f05-95a1-dc4349908783", "metadata": {}, "outputs": [], @@ -3249,7 +3268,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 47, "id": "f4dd943b-472c-492a-b65f-87c6a62fde97", "metadata": {}, "outputs": [ @@ -3623,7 +3642,7 @@ "[17486 rows x 140 columns]" ] }, - "execution_count": 67, + "execution_count": 47, "metadata": {}, 
"output_type": "execute_result" } @@ -3656,7 +3675,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 49, "id": "cb7a566a-7597-4d9c-92c7-f87910b2611a", "metadata": {}, "outputs": [], @@ -3675,7 +3694,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 50, "id": "bd911ecd-bdd3-4f97-8717-b47e98cb662f", "metadata": {}, "outputs": [ @@ -4127,36 +4146,23 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 51, "id": "b14e01c6-9f01-4a54-a3d5-2914997c278f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Index(['Unnamed: 0', 'DTG', 'air_temp_obs', 'dewpoint_obs',\n", - " 'wind_direction_obs', 'wind_speed_obs', 'wind_gust_obs', 'air_temp_1',\n", - " 'air_temp_2', 'air_temp_3', 'air_temp_4', 'air_temp_5', 'air_temp_6',\n", - " 'air_temp_7', 'air_temp_8', 'air_temp_9', 'air_temp_10', 'air_temp_11',\n", - " 'air_temp_12', 'air_temp_13', 'air_temp_14', 'air_temp_15',\n", - " 'air_temp_16', 'air_temp_17', 'air_temp_18', 'air_temp_19',\n", - " 'air_temp_20', 'air_temp_21', 'air_temp_22', 'sh_1', 'sh_2', 'sh_3',\n", - " 'sh_4', 'sh_5', 'sh_6', 'sh_7', 'sh_8', 'sh_9', 'sh_10', 'sh_11',\n", - " 'sh_12', 'sh_13', 'sh_14', 'sh_15', 'sh_16', 'sh_17', 'sh_18', 'sh_19',\n", - " 'sh_20', 'sh_21', 'sh_22', 'winddir_1', 'windspd_1', 'winddir_2',\n", - " 'windspd_2', 'winddir_3', 'windspd_3', 'winddir_4', 'windspd_4',\n", - " 'winddir_5', 'windspd_5', 'winddir_6', 'windspd_6', 'winddir_7',\n", - " 'windspd_7', 'winddir_8', 'windspd_8', 'winddir_9', 'windspd_9',\n", - " 'winddir_10', 'windspd_10', 'winddir_11', 'windspd_11', 'winddir_12',\n", - " 'windspd_12', 'winddir_13', 'windspd_13', 'winddir_14', 'windspd_14',\n", - " 'winddir_15', 'windspd_15', 'winddir_16', 'windspd_16', 'winddir_17',\n", - " 'windspd_17', 'winddir_18', 'windspd_18', 'winddir_19', 'windspd_19',\n", - " 'winddir_20', 'windspd_20', 'winddir_21', 'windspd_21', 'winddir_22',\n", - " 'windspd_22', 'rotors_present'],\n", - " 
dtype='object')" + "Index(['Unnamed: 0', 'air_temp_obs', 'dewpoint_obs', 'wind_direction_obs',\n", + " 'wind_speed_obs', 'wind_gust_obs', 'air_temp_1', 'air_temp_2',\n", + " 'air_temp_3', 'air_temp_4',\n", + " ...\n", + " 'v_wind_18', 'u_wind_19', 'v_wind_19', 'u_wind_20', 'v_wind_20',\n", + " 'u_wind_21', 'v_wind_21', 'u_wind_22', 'v_wind_22', 'time'],\n", + " dtype='object', length=140)" ] }, - "execution_count": 33, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -4167,7 +4173,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 52, "id": "749428e9-7cba-4327-9b72-c0f25ac1d71e", "metadata": {}, "outputs": [], @@ -4182,7 +4188,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 53, "id": "5a66447a-244a-4686-8b10-eba2f0cd75f3", "metadata": {}, "outputs": [], @@ -4197,93 +4203,157 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "fda48027-d3dc-4d9a-b295-401b4b6f14c0", - "metadata": {}, - "outputs": [], - "source": [ - "panel.widgets.Select(options=obs_dict)" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 72, "id": "b1859bbe-dfad-4674-a151-ce9e698d2834", "metadata": {}, "outputs": [], "source": [ - "def do_histogram():\n", - " passa" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1eb18b27-432d-47b9-8dfa-d736646ebdc9", - "metadata": {}, - "outputs": [], - "source": [ - "def do_height" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9d4d9ffa-6efb-4c2d-bf8a-b8f4c1ab5227", - "metadata": {}, - "outputs": [], - "source": [ - "def stats_dashboard(select_df, variable)" + "def do_histogram(plotting_df, rotors_feature_name, selected_var, subset):\n", + " fig1 = matplotlib.pyplot.figure(f'histogram of {selected_var}')\n", + " ax1 = fig1.add_subplot(1,1,1,title=f'histogram of {selected_var}')\n", + " if subset == 'all':\n", + " data_to_plot = plotting_df\n", + " elif subset == 'no_rotors':\n", + " 
data_to_plot = plotting_df[plotting_df[rotors_feature_name] == 0.0]\n", + " elif subset == 'rotors_present':\n", + " data_to_plot = plotting_df[plotting_df[rotors_feature_name] == 1.0]\n", + " _ = data_to_plot[selected_var].plot.hist(bins=20, ax=ax1)\n", + " return fig1" ] }, { "cell_type": "code", - "execution_count": null, - "id": "5faf10cc-0a23-4a89-acac-2824e0783fb7", + "execution_count": 73, + "id": "fda48027-d3dc-4d9a-b295-401b4b6f14c0", "metadata": {}, "outputs": [], "source": [ - "# load data in using ray datasets csv loader" + "hist_var_select = panel.widgets.Select(\n", + " options=obs_dict,\n", + " name='selected_var',\n", + ")" ] }, { "cell_type": "code", - "execution_count": null, - "id": "4fa147a9-30fe-4e45-8436-81ff0cbecf59", + "execution_count": 74, + "id": "b73b2288-a329-4c51-b0e6-647a9925751e", "metadata": {}, "outputs": [], "source": [ - "# define data cleaning as a series of small function which return a dataset" + "subset_select = panel.widgets.Select(\n", + " options = ['all', 'no_rotors', 'rotors_present'],\n", + " name='subset',\n", + ")" ] }, { "cell_type": "code", - "execution_count": null, - "id": "27ebdccc-dc34-4ce5-a731-e776d60323db", + "execution_count": 78, + "id": "58f8f948-da65-49b1-9fb2-128e3a92880c", "metadata": {}, "outputs": [], "source": [ - "# call map_functions on the dataset to clean the data\n" + "hist_plotter = panel.bind(\n", + " functools.partial(\n", + " do_histogram,\n", + " plotting_df=rotors_df,\n", + " rotors_feature_name=target_feature_name,\n", + " ),\n", + " selected_var=hist_var_select,\n", + " subset=subset_select,\n", + ")" ] }, { "cell_type": "code", - "execution_count": null, - "id": "d3788b80-e937-438a-b4e9-3e2ddc2e9d0b", + "execution_count": 85, + "id": "5e9a5dfa-16ee-47c0-9902-d260b95422f4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX4AAAEICAYAAABYoZ8gAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAWqklEQVR4nO3debRlZX3m8e/DIKBCoJqCFJPlQDC0kcHSjvMASVSMYGfh0FELRUvTpFtjVmvhcsUhSVtJt1OvJK1EoxWHKOIAih3FUrCTNkKhRBGwMVgBrLKqQJEhRoT+9R9nXzncusO5t2qfc6n3+1nrrLPHs3/7rb2eu+s9e++TqkKS1I49Jl2AJGm8DH5JaozBL0mNMfglqTEGvyQ1xuCXpMYY/CLJpiQnzzLviUm+M+6alpIMvD/Jj5Jcuoj1v53kKbu+sqUnycVJXjbpOjS3vSZdgJa2qvrfwDHzLZfkTcDDquqFvRc1fk8Afg04oqruWOjKVfVvF7PR3bxNNUGe8WvJSzLpE5QHAZsWE/rzWQL7pgYZ/JpyfJJvJvlxko8l2RcgyVOS3Di1UJLXJfl+ktuSfCfJSUmeDrweeF6S25P8Y7fsYUkuSPLDJN9N8vKhz9kvyfqu++TqJK+dtp1N3ba+CdyRZK8ka5P8U7ftq5I8Z2j5M5L8fZJ3JLklyXVJHtdNvyHJtiSrZ9v52WpNcibwXuCx3b69eYZ1H5rkS0luTnJTkg8nOXDavpzcDb8pyXlJPpTkVuCMWeqZrU1/Icn7kmzp/h3+KMmei2mDJB9I8u4kF3VtekmSB83WRkPrPS7JZd2xclmSx01b5KFJLu3mn59kWbfevt1+39zVd1mSQ+fbnnpQVb4afwGbgEuBw4BlwNXAK7t5TwFu7IaPAW4ADuvGVwIP7YbfBHxo2udeAvwFsC9wPLAdOKmbt66bfxBwBPDNqe0M1XQFcCSwXzft9K7GPYDnAXcAK7p5ZwB3AS8B9gT+CLge+HNgH+DXgduAB87SBnPVegbwd3O038MYdAXtAywHvgK8c9q+nDzUTj8DTuv2Y785PnemNv008B7gAcAh3b/bKxbTBsAHuvEndfPfNdd+dussA34EvIhBV/ELuvF/082/GPg+8Iiuxk9M7QPwCuAzwP27+h4FHDDp47/F18QL8DX5VxdMLxwa/1Pg3d3wU7gn+B8GbANOBvae9hn3CikGgX03sP/QtLcCH+iGrwN+Y2jey9gx+F86T91XAKd2w2cA1w7N+xWggEOHpt0MHD/D58xX6xnzBeK0zzsN+Ma0fRkO/q+M+DnT2/RQ4KcM/bHogvfLi2mDLvg/OjTvgV07HDlHTS8CLp027avAGd3wxcC6oXnHAnd2Qf9S4P8Aj5z0Md/6y64eTfnB0PC/MAiBe6mq7wKvZhBI25J8NMlhs3zeYcAPq+q2oWn/DBw+NP+GoXnDwzNOS/LiJFd03QS3MDirPHhoka1Dwz/pap4+bYf9GqHWOSU5pGuL73fdNx+aVtd0M+3rKB4E7A1sGWqD9zA485+y0Db4eS1VdTvwQwbtMZvDGLTNsOltdcO0eXszaI8PAp8HPppkc5I/TbL3HNtSTwx+LUhVfaSqnsAghAr4k6lZ0xbdDCxLsv/QtKMYdAMAbGHQxTPlyJk2NzXQ9T3/JfC7DLoVDgSuBLK4PVlQrfN5a1frI6vqAOCF89Q16iNxpy93A4Mz/oOr6sDudUAt8qqhzs/bPckDGXTlbJ5j+c0M/u2HTW+rI6fN+xlwU1X9rKreXFXHAo8DngW8eCdq1yIZ/BpZkmOSPC3JPsC/Mjh7vLubvRVYmWQPgKq6gcF/69/afan3SOBM4MPd8ucCZyc5KMnhDAJ9Lg9gEITbu1pewuCMf6eNUOt89gduB27p9uW/7Iq62LFNtwBfAN6W5IAke3RfLD95J7bxzCRPSHI/4A+Br3XtMZvPAb+U5D90X7g/j0F3zmeHlnlhkmOT3B94C3BeVd2d5KlJfqX
7MvpWBn8Q7t5hC+qdwa+F2IfBl7I3MegaOoTBlScAH+/eb07y9W74BQy+AN4MfAp4Y1Vd1M17C3Aj8D3gi8B5DM5mZ1RVVwFvY9CfvJVB//Xf74qdGqHW+bwZOBH4MXAh8MldVNNMbfpi4H7AVQy+VD0PWLET2/gI8EYGXTyPAn57roWr6mYGZ+q/z+D7gtcCz6qqm4YW+yCD7w9+wODL8v/cTf/Frt5bGVxAcAmDbjGNWbovYKSJSvI7wPOramfOXrUAST7A4Av1N0y6Fo2XZ/yaiCQrkjy+6644hsEZ5KcmXZfUAoNfk3I/Blek3AZ8CTifwXX0TUnyv7obtKa/Xj//2r3V9MRZarp9UjVp17KrR5Ia4xm/JDXmPvGAqIMPPrhWrlw56TIk6T7l8ssvv6mqlk+ffp8I/pUrV7Jx48ZJlyFJ9ylJpt9lDdjVI0nNMfglqTEGvyQ1xuCXpMYY/JLUGINfkhpj8EtSYwx+SWqMwS9JjblP3Lmr8Vm59sJFr7tp3Sm7sBJJffGMX5IaY/BLUmMMfklqjMEvSY0x+CWpMQa/JDXG4Jekxhj8ktQYg1+SGmPwS1JjDH5JaozBL0mNMfglqTEGvyQ1ptfgT3JgkvOSXJPk6iSPTbIsyUVJru3eD+qzBknSvfV9xv8u4G+r6uHAccDVwFpgQ1UdDWzoxiVJY9Jb8Cc5AHgS8D6Aqrqzqm4BTgXWd4utB07rqwZJ0o76PON/CLAdeH+SbyR5b5IHAIdW1RaA7v2QHmuQJE3TZ/DvBZwI/M+qOgG4gwV06yRZk2Rjko3bt2/vq0ZJak6fwX8jcGNVfa0bP4/BH4KtSVYAdO/bZlq5qs6pqlVVtWr58uU9lilJbekt+KvqB8ANSY7pJp0EXAVcAKzupq0Gzu+rBknSjvbq+fP/E/DhJPcDrgNewuCPzblJzgSuB07vuQZJ0pBeg7+qrgBWzTDrpD63K0manXfuSlJjDH5JaozBL0mNMfglqTEGvyQ1xuCXpMYY/JLUGINfkhpj8EtSYwx+SWqMwS9JjTH4JakxBr8kNcbgl6TGGPyS1BiDX5IaY/BLUmMMfklqjMEvSY0x+CWpMb3+2LomY+XaCyddgqQlzDN+SWqMwS9Jjem1qyfJJuA24G7grqpalWQZ8DFgJbAJeG5V/ajPOiRJ9xjHGf9Tq+r4qlrVja8FNlTV0cCGblySNCaT6Oo5FVjfDa8HTptADZLUrL6Dv4AvJLk8yZpu2qFVtQWgez9kphWTrEmyMcnG7du391ymJLWj78s5H19Vm5McAlyU5JpRV6yqc4BzAFatWlV9FShJren1jL+qNnfv24BPAY8BtiZZAdC9b+uzBknSvfUW/EkekGT/qWHg14ErgQuA1d1iq4Hz+6pBkrSjPrt6DgU+lWRqOx+pqr9NchlwbpIzgeuB03usQZI0TW/BX1XXAcfNMP1m4KS+titJmpt37kpSYwx+SWqMwS9JjTH4JakxBr8kNcbgl6TGGPyS1BiDX5IaY/BLUmMMfklqTN+PZVZDVq69cKfW37TulF1UiaS5eMYvSY0x+CWpMQa/JDXG4Jekxhj8ktQYg1+SGmPwS1JjDH5JasxIwZ/kEX0XIkkaj1HP+N+d5NIk/zHJgX0WJEnq10jBX1VPAH4bOBLYmOQjSX6t18okSb0YuY+/qq4F3gC8Dngy8D+SXJPk38+1XpI9k3wjyWe78WVJLkpybfd+0M7sgCRpYUbt439kkncAVwNPA36zqn65G37HPKu/qltvylpgQ1UdDWzoxiVJYzLqGf+fAV8Hjquqs6rq6wBVtZnB/wJmlOQI4BTgvUOTTwXWd8PrgdMWWLMkaSeM+ljmZwI/qaq7AZLsAexbVf9SVR+cY713Aq8F9h+admhVbQGoqi1JDplpxSRrgDUARx111Ihl7j529hHHkjSbUc/4vwjsNzR+/27arJI8C9hWVZcvprCqOqeqVlXVquXLly/mIyRJMxj1jH/fqrp9aqS
qbk9y/3nWeTzw7CTPBPYFDkjyIWBrkhXd2f4KYNuiKpckLcqoZ/x3JDlxaiTJo4CfzLVCVZ1dVUdU1Urg+cCXquqFwAXA6m6x1cD5C65akrRoo57xvxr4eJLN3fgK4HmL3OY64NwkZwLXA6cv8nMkSYswUvBX1WVJHg4cAwS4pqp+NupGqupi4OJu+GbgpAVXKknaJRbyY+uPBlZ265yQhKr6616qkiT1ZqTgT/JB4KHAFcDd3eQCDH5Juo8Z9Yx/FXBsVVWfxUiS+jfqVT1XAr/YZyGSpPEY9Yz/YOCqJJcCP52aWFXP7qUqaYF25k7nTetOmch2d3bb0mKNGvxv6rMISdL4jHo55yVJHgQcXVVf7O7a3bPf0iRJfRj1qp6XM3hg2jIGV/ccDrwbr8fXLuSD6aTxGPXL3bMYPHvnVvj5j7LM+FRNSdLSNmrw/7Sq7pwaSbIXg+v4JUn3MaMG/yVJXg/s1/3W7seBz/RXliSpL6MG/1pgO/At4BXA55jjl7ckSUvXqFf1/D/gL7uXJOk+bNSrer7HDH36VfWQXV6RJKlXC3lWz5R9GTxDf9muL0eS1LeR+vir6uah1/er6p3A0/otTZLUh1G7ek4cGt2Dwf8A9u+lImnMvHFMrRm1q+dtQ8N3AZuA5+7yaiRJvRv1qp6n9l2IJGk8Ru3qec1c86vq7bumHElS3xZyVc+jgQu68d8EvgLc0EdRkqT+LOSHWE6sqtsAkrwJ+HhVvayvwiRJ/Rj1kQ1HAXcOjd8JrNzl1UiSejfqGf8HgUuTfIrBHbzPAf56rhWS7MugO2ifbjvnVdUbkywDPsbgD8cm4LlV9aNFVS9JWrBRb+D6Y+AlwI+AW4CXVNV/nWe1nwJPq6rjgOOBpyf5VQYPfNtQVUcDG7pxSdKYjNrVA3B/4NaqehdwY5IHz7VwDdzeje7dvQo4FVjfTV8PnLagiiVJO2Wk4E/yRuB1wNndpL2BD42w3p5JrgC2ARdV1deAQ6tqC0D3PuMveSVZk2Rjko3bt28fpUxJ0ghGPeN/DvBs4A6AqtrMCI9sqKq7q+p44AjgMUkeMWphVXVOVa2qqlXLly8fdTVJ0jxGDf47q6roHs2c5AEL2UhV3QJcDDwd2JpkRfc5Kxj8b0CSNCajBv+5Sd4DHJjk5cAXmedHWZIsT3JgN7wfcDJwDYObwFZ3i60Gzl9E3ZKkRZr3cs4kYXD55cOBW4FjgD+oqovmWXUFsD7Jngz+wJxbVZ9N8lUGf0jOBK5n8Gx/SdKYzBv8VVVJPl1VjwLmC/vh9b4JnDDD9JuBkxZUpSRplxm1q+cfkjy610okSWMx6p27TwVemWQTgyt7wuA/A4/sqzBJUj/mDP4kR1XV9cAzxlSPJKln853xf5rBUzn/Ocknquq3xlCTJKlH8/XxZ2j4IX0WIkkaj/mCv2YZliTdR83X1XNcklsZnPnv1w3DPV/uHtBrdZKkXW7O4K+qPcdViCRpPBbyWGZJ0m7A4Jekxhj8ktQYg1+SGmPwS1JjDH5JaozBL0mNMfglqTEGvyQ1xuCXpMYY/JLUGINfkhpj8EtSYwx+SWpMb8Gf5MgkX05ydZJvJ3lVN31ZkouSXNu9H9RXDZKkHfV5xn8X8PtV9cvArwJnJTkWWAtsqKqjgQ3duCRpTHoL/qraUlVf74ZvA64GDgdOBdZ3i60HTuurBknSjub76cVdIslK4ATga8ChVbUFBn8ckhwyyzprgDUARx111DjK3OVWrr1w0iVoiZvUMbJp3SkT2a6Wht6/3E3yQOATwKur6tb5lp9SVedU1aqqWrV8+fL+CpSkxvQa/En2ZhD6H66qT3aTtyZZ0c1fAWzrswZJ0r31eVVPgPcBV1fV24dmXQCs7oZXA+f3VYMkaUd99vE/HngR8K0kV3TTXg+sA85NciZwPXB6jzVIkqbpLfir6u+AzDL7pL62K0mam3fuSlJjDH5JaozBL0mNMfglqTEGvyQ1xuCXpMYY/JL
UGINfkhpj8EtSYwx+SWqMwS9JjTH4JakxBr8kNcbgl6TGGPyS1BiDX5IaY/BLUmMMfklqjMEvSY3p88fW7/NWrr1w0iVIvdiZY3vTulN2YSWaBM/4JakxBr8kNaa34E/yV0m2JblyaNqyJBclubZ7P6iv7UuSZtbnGf8HgKdPm7YW2FBVRwMbunFJ0hj1FvxV9RXgh9Mmnwqs74bXA6f1tX1J0szG3cd/aFVtAejeD5ltwSRrkmxMsnH79u1jK1CSdndL9svdqjqnqlZV1arly5dPuhxJ2m2MO/i3JlkB0L1vG/P2Jal54w7+C4DV3fBq4Pwxb1+SmtfbnbtJ/gZ4CnBwkhuBNwLrgHOTnAlcD5ze1/Yl9cO7fu/7egv+qnrBLLNO6mubkqT5LdkvdyVJ/TD4JakxBr8kNcbgl6TGGPyS1BiDX5IaY/BLUmMMfklqjMEvSY0x+CWpMb09smGp2JnnikjS7sgzfklqjMEvSY0x+CWpMQa/JDXG4Jekxhj8ktQYg1+SGmPwS1JjDH5Jasxuf+eupKVjknfSb1p3ysS2vdR4xi9JjTH4JakxE+nqSfJ04F3AnsB7q2rdJOqQpL7tbPdWH11UYz/jT7In8OfAM4BjgRckOXbcdUhSqybR1fMY4LtVdV1V3Ql8FDh1AnVIUpMm0dVzOHDD0PiNwL+bvlCSNcCabvT2JN8ZQ23DDgZuGvM2lzrbZEe2yY6WZJvkTya26Z1qj52s+0EzTZxE8GeGabXDhKpzgHP6L2dmSTZW1apJbX8psk12ZJvsyDa5t6XYHpPo6rkROHJo/Ahg8wTqkKQmTSL4LwOOTvLgJPcDng9cMIE6JKlJY+/qqaq7kvwu8HkGl3P+VVV9e9x1jGBi3UxLmG2yI9tkR7bJvS259kjVDt3rkqTdmHfuSlJjDH5JakyTwZ/kyCRfTnJ1km8neVU3/WNJruhem5Jc0U1fmeQnQ/PePdEd6MEcbXJ8kn/o9ntjkscMrXN2ku8m+U6S35hc9f1YaJs0fpwcl+SrSb6V5DNJDhhap9XjZMY2WRLHSVU19wJWACd2w/sD/xc4dtoybwP+oBteCVw56bon0SbAF4BndNOfCVzcDR8L/COwD/Bg4J+APSe9HxNuk5aPk8uAJ3fTXwr8ocfJrG0y8eOkyTP+qtpSVV/vhm8DrmZwRzEASQI8F/ibyVQ4fnO0SQFTZ2+/wD33XJwKfLSqflpV3wO+y+BxHLuNRbTJbm+ONjkG+Eq32EXAb3XDLR8ns7XJxDUZ/MOSrAROAL42NPmJwNaqunZo2oOTfCPJJUmeOM4ax21am7wa+G9JbgD+O3B2t9hMj944nN3UiG0C7R4nVwLP7madzj03abZ8nMzWJjDh46Tp4E/yQOATwKur6tahWS/g3mf7W4CjquoE4DXAR4b7MHcnM7TJ7wC/V1VHAr8HvG9q0RlW3y2vDV5Am7R8nLwUOCvJ5Qy6O+6cWnSG1Vs5TmZrk8kfJ5PuH5tgv9zeDG4ie8206XsBW4Ej5lj3YmDVpPdhHG0C/Jh77vcIcGs3fDZw9tBynwceO+l9mGSbtHycTJv/S8ClrR8ns7XJUjhOmjzj7/rw3wdcXVVvnzb7ZOCaqrpxaPnl3e8IkOQhwNHAdeOqdxzmaJPNwJO74acBU91fFwDPT7JPkgczaJNLx1XvOCy0TVo+TpIc0r3vAbwBmLpSpdnjZLY2WQrHSas/tv544EXAt6Yu2QReX1WfY/DsoOlf6j4JeEuSu4C7gVdW1Q/HVeyYzNgmwMuBdyXZC/hXukdlV9W3k5wLXAXcBZxVVXePvep+LahNaPs4OTrJWd34J4H3Q/PHyYxtwhI4TnxkgyQ1psmuHklqmcEvSY0x+CWpMQa/JDXG4Jekxhj8ktQYg1+SGvP/Ab78/27tt8YPAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ - "# do some holoviz plots" + "rotors_dash = panel.Column(\n", + " panel.widgets.StaticText(value='Rotors Data Exploration Dashboard'),\n", + " panel.Row(subset_select, hist_var_select),\n", + " hist_plotter)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "4bb76688-6f9d-4521-8d13-ceeaa266451a", + "execution_count": 86, + "id": "391151c3-3781-4824-9a43-5aa5dd6578ee", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": {}, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.holoviews_exec.v0+json": "", + "text/html": [ + "
\n", + "
\n", + "
\n", + "" + ], + "text/plain": [ + "Column\n", + " [0] StaticText(value='Rotors Data E...)\n", + " [1] Row\n", + " [0] Select(name='subset', options=['all', 'no_rotors', ...], value='rotors_present')\n", + " [1] Select(name='selected_var', options={'Temperature': 'air_temp_...}, value='air_temp_obs')\n", + " [2] ParamFunction(function)" + ] + }, + "execution_count": 86, + "metadata": { + "application/vnd.holoviews_exec.v0+json": { + "id": "1021" + } + }, + "output_type": "execute_result" + } + ], "source": [ - "# create a panel dashboard (maybe using matplotlib plus holoviews/geovierws?" + "rotors_dash.servable()" ] }, { @@ -4298,9 +4368,17 @@ "intake catalog of data\n" ] }, + { + "cell_type": "markdown", + "id": "9af61e77-6b71-4df8-aeeb-f345906e07a1", + "metadata": {}, + "source": [ + "We will often need to use various domain-specific tools and techniques in the data preparation part of the pipeline to prepare the data for use with more generic machine learning and data science tools. In this example we are using the Met Office Python library [Iris](https://scitools-iris.readthedocs.io/en/stable/), which handles gridded meteorological (i.e. climate and weather) data and metadata and includes various meteorology-specific functionality and handles specific data storage formats such as *NetCDF* and *Grib2*. Other research domains will have similar packages for handling their specific needs, which will similarly need to be supported as part of machine learning projects in those research domains." 
+ ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 87, "id": "681bdb50-4e64-458c-b901-b26c1cdfc641", "metadata": {}, "outputs": [], @@ -4312,7 +4390,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 88, "id": "a24f0ce6-0cf7-44bf-8865-27d26637fc0c", "metadata": {}, "outputs": [], @@ -4323,10 +4401,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 90, "id": "f0ee36a9-356d-4cbe-b3a7-b411aab36835", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "using default path\n" + ] + }, + { + "data": { + "text/plain": [ + "PosixPath('/Users/stephen.haddad/data/era5')" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "try:\n", " era5_data_dir = pathlib.Path(os.environ['RSE22_ERA5_DATA_DIR'])\n", @@ -4339,7 +4435,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 91, "id": "e5bce62b-b473-4acb-9f16-93261437efbc", "metadata": {}, "outputs": [], @@ -4349,7 +4445,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 92, "id": "18486bcf-c1a7-4d49-be63-00950ad7c30d", "metadata": {}, "outputs": [], @@ -4359,7 +4455,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 93, "id": "5a154744-c15b-496f-830d-c46c06a83195", "metadata": {}, "outputs": [], @@ -4369,10 +4465,38 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "54da23b2-a3d7-44f2-ad38-b67d9f1e7f62", + "execution_count": 94, + "id": "2e30e6ef-f6ad-4e60-b975-9b98911e7089", + "metadata": {}, + "outputs": [], + "source": [ + "era5_mslp_uk_subset_fname = 'era5_mslp_UK_2017_2020.nc'" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "bcee8066-ed13-400f-be36-04b0ec69d6fc", "metadata": {}, "outputs": [], + "source": [ + "import xarray" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "id": 
"54da23b2-a3d7-44f2-ad38-b67d9f1e7f62", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loading cached data\n" + ] + } + ], "source": [ "if do_global_extract:\n", " print('do global_extract')\n", @@ -4383,25 +4507,251 @@ " longitude=uk_na_bounds['longitude'])\n", "else:\n", " print('loading cached data')\n", - " mslp_era5_uk_cube = iris.load_cube(str(era5_data_dir / 'era5_mslp_UK_2017_2020.nc'))" + " if load_from_zenodo:\n", + " mslp_era5_uk_cube = xarray.load_dataarray(zenodo_record_root + era5_mslp_uk_subset_fname).to_iris()\n", + " else:\n", + " mslp_era5_uk_cube = iris.load_cube(str(era5_data_dir / era5_mslp_uk_subset_fname))\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 104, "id": "661e11e8-21ca-423f-9007-8674210dea1c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
depends on and_-f585bb014bb223423d8ed1112f96c7c4read-csv-e1701bbd29ad5a0d155e1052f943535d
read-csv-1e7bfeac4ff4e0386ae4e2f7bd1356d8and_-bdd8caf5d1fd302b329e93e0c1ca81ea
\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
Air Pressure At Mean Sea Level (Pa)timelatitudelongitude
Shape3506410181
Dimension coordinates
\ttimex--
\tlatitude-x-
\tlongitude--x
Attributes
Conventions CF-1.7
institution ECMWF
nameCDM Mean_sea_level_pressure_surface
nameECMWF Mean sea level pressure
product_type analysis
shortNameECMWF msl
source Reanalysis
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "mslp_era5_uk_cube" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 105, "id": "5e613eb7-fb2a-4aa4-8fc4-fb1466965a85", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Array Chunk
Bytes 1.07 GiB 121.59 MiB
Shape (35064, 101, 81) (3896, 101, 81)
Count 10 Tasks 9 Chunks
Type float32 numpy.ndarray
\n", + "
\n", + " \n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " 81\n", + " 101\n", + " 35064\n", + "\n", + "
" + ], + "text/plain": [ + "dask.array" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "print(type(mslp_era5_uk_cube.core_data()))\n", "mslp_era5_uk_cube.core_data()" @@ -4409,7 +4759,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 106, "id": "9c3fd214-9978-4a91-8120-0bc6d37f7de1", "metadata": {}, "outputs": [], @@ -4419,39 +4769,256 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 107, "id": "9f978c34-ad8d-4a46-aaab-c8e8266e3861", "metadata": {}, - "outputs": [], - "source": [ - "mslp_uk_seasonal_mean = mslp_era5_uk_cube.aggregated_by(['season_number'],iris.analysis.MEAN)\n", - "mslp_uk_seasonal_mean" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7c0e96ed-9100-418b-901a-c63f87fe5a2f", - "metadata": {}, - "outputs": [], - "source": [ - "mslp_uk_seasonal_mean.core_data()" - ] - }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
Air Pressure At Mean Sea Level (Pa)--latitudelongitude
Shape410181
Dimension coordinates
\tlatitude-x-
\tlongitude--x
Auxiliary coordinates
\tseason_numberx--
\ttimex--
Cell methods
\tmean season_number
Attributes
Conventions CF-1.7
institution ECMWF
nameCDM Mean_sea_level_pressure_surface
nameECMWF Mean sea level pressure
product_type analysis
shortNameECMWF msl
source Reanalysis
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mslp_uk_seasonal_mean = mslp_era5_uk_cube.aggregated_by(['season_number'],iris.analysis.MEAN)\n", + "mslp_uk_seasonal_mean" + ] + }, { "cell_type": "code", - "execution_count": null, - "id": "204c8850-5be1-4c31-834e-2c6c8cd6ab68", + "execution_count": 108, + "id": "7c0e96ed-9100-418b-901a-c63f87fe5a2f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Array Chunk
Bytes 127.83 kiB 31.96 kiB
Shape (4, 101, 81) (1, 101, 81)
Count 76 Tasks 4 Chunks
Type float32 numpy.ndarray
\n", + "
\n", + " \n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " 81\n", + " 101\n", + " 4\n", + "\n", + "
" + ], + "text/plain": [ + "dask.array" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "era5_flat_deseasoned = era5_flat_deseasoned.compute()\n", - "era5_flat_deseasoned" + "mslp_uk_seasonal_mean.core_data()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 112, "id": "5869d214-0910-4034-98ec-f440a994c777", "metadata": {}, "outputs": [], @@ -4466,7 +5033,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 113, "id": "042d1d6f-ca6a-4780-9027-e79f15ce4de4", "metadata": {}, "outputs": [], @@ -4483,7 +5050,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 114, "id": "2105bfa1-a47c-47e8-9d71-bcdbfde2309b", "metadata": {}, "outputs": [], @@ -4496,7 +5063,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 115, "id": "17852158-7186-4506-aae4-dcc8a2fa5aac", "metadata": {}, "outputs": [], @@ -4509,30 +5076,119 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 116, "id": "4912427b-a826-4d56-8e20-781598b5253f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "season 2\n" + ] + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYcAAAD3CAYAAAD2S5gLAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAUYUlEQVR4nO3dfZBddX3H8feXbBIwIiBZKCSkxBalqQXFFdHRFmUUwtSmDnQadESpnZRWHPtXwT7awU6L03YsBZtJKbX+UWNVqrEijE8UWsWSaAgECi7BkiWM4cEiD4U8ffvHOWEv97cPJ3DvuZfl/Zq5s/ee87vnfPe7Z89nz7n3no3MRJKkTgcNugBJ0vAxHCRJBcNBklQwHCRJBcNBklQwHCRJhVnDISKujoidEXH7NPMjIi6PiPGI2BIRp/S+TElSm5ocOXwKOGuG+SuBE+rbGuDvnn9ZkqRBmjUcMvNG4JEZhqwCPp2Vm4HDI+KYXhUoSWpfL15zWAJs73g8UU+TJL1AjfRgGTHFtCmvyRERa6hOPbFo0aLXnXjiiT1YvSS9eGzatOmhzBzt93p6EQ4TwHEdj5cCO6YamJnrgHUAY2NjuXHjxh6sXpJePCLif9pYTy9OK20Azq/ftXQa8GhmPtCD5UqSBmTWI4eI+AxwOrA4IiaAPwHmA2TmWuBa4GxgHHgSuKBfxUqS2jFrOGTmebPMT+CDPatIkjRwfkJaklQwHCRJBcNBklQwHCRJBcNBklQwHCRJBcNBklQwHCRJBcNBklQwHCRJBcNBklQwHCRJBcNBklQwHCRJBcNBklQwHCRJBcNBklQwHCRJBcNBklQwHCRJBcNBklQwHCRJhUbhEBFnRcRdETEeEZdMMf+wiPhyRNwaEVsj4oLelypJasus4RAR84ArgZXACuC8iFjRNeyDwB2ZeTJwOvBXEbGgx7VKklrS5MjhVGA8M7dl5i5gPbCqa0wCh0ZEAC8FHgH29LRSSVJrmoTDEmB7x+OJelqnK4CfA3YAtwEfzsx93QuKiDURsTEiNj744IPPsWRJUr81CYeYYlp2PT4T2AwcC7wGuCIiXlY8KXNdZo5l5tjo6OgBlipJakuTcJgAjut4vJTqCKHTBcA1WRkH7gVO7E2JkqS2NQmHW4ATImJ5/SLzamBD15j7gDMAIuJo4FXAtl4WKklqz8hsAzJzT0RcBFwPzAOuzsytEXFhPX8tcCnwqYi4jeo01MWZ+VAf65Yk9dGs4QCQmdcC13ZNW9txfwfwjt6WJkkaFD8hLUkqGA6SpILhIEkqGA6SpILhIEkqGA6SpILhIEkqGA6SpILhIEkqGA6SpILhIEkqGA6SpILhIEkqGA6SpILhIEkqGA6SpILhIEkqGA6SpILhIEkqGA6SpILhIEkqNAqHiDgrIu6KiPGIuGSaMadHxOaI2BoR/97bMiVJbRqZbUBEzAOuBN4OTAC3RMSGzLyjY8zhwCeBszLzvog4qk/1SpJa0OTI4VRgPDO3ZeYuYD2wqmvMu4FrMvM+gMzc2dsyJUltahIOS4DtHY8n6mmdXgkcERE3RMSmiDi/VwVKkto362klIKaYllMs53XAGcAhwHci4ubMvPtZC4pYA6wBWLZs2YFXK0lqRZMjhwnguI7HS4EdU4y5LjOfyMyHgBuBk7sXlJnrMnMsM8dGR0efa82SpD5rEg63ACdExPKIWACsBjZ0jfkS8JaIGImIlwBvAO7sbamSpLbMelopM/dExEXA9cA84OrM3BoRF9bz12bmnRFxHbAF2AdclZm397NwSVL/RGb3ywftGBsby40bNw5k3ZL0QhURmzJzrN/r8RPSkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKjQKh4g4KyLuiojxiLhkhnGvj4i9EXFu70qUJLV
t1nCIiHnAlcBKYAVwXkSsmGbcZcD1vS5SktSuJkcOpwLjmbktM3cB64FVU4z7EPAFYGcP65MkDUCTcFgCbO94PFFPe0ZELAHeBaydaUERsSYiNkbExgcffPBAa5UktaRJOMQU07Lr8SeAizNz70wLysx1mTmWmWOjo6MNS5QktW2kwZgJ4LiOx0uBHV1jxoD1EQGwGDg7IvZk5hd7UaQkqV1NwuEW4ISIWA7cD6wG3t05IDOX778fEZ8C/s1gkKQXrlnDITP3RMRFVO9CmgdcnZlbI+LCev6MrzNIkl54mhw5kJnXAtd2TZsyFDLz/c+/LEnSIPkJaUlSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSoVE4RMRZEXFXRIxHxCVTzH9PRGypb9+OiJN7X6okqS2zhkNEzAOuBFYCK4DzImJF17B7gV/KzJOAS4F1vS5UktSeJkcOpwLjmbktM3cB64FVnQMy89uZ+eP64c3A0t6WKUlqU5NwWAJs73g8UU+bzgeArz6foiRJgzXSYExMMS2nHBjxVqpwePM089cAawCWLVvWsERJUtuaHDlMAMd1PF4K7OgeFBEnAVcBqzLz4akWlJnrMnMsM8dGR0efS72SpBY0CYdbgBMiYnlELABWAxs6B0TEMuAa4L2ZeXfvy5QktWnW00qZuSciLgKuB+YBV2fm1oi4sJ6/Fvhj4EjgkxEBsCczx/pXtiSpnyJzypcP+m5sbCw3btw4kHVL0gtVRGxq449vPyEtSSoYDpKkguEgSSoYDpKkguEgSSoYDpKkguEgSSoYDpKkguEgSSoYDpKkguEgSSoYDpKkguEgSSoYDpKkguEgSSoYDpKkguEgSSoYDpKkguEgSSoYDpKkguEgSSo0CoeIOCsi7oqI8Yi4ZIr5ERGX1/O3RMQpvS9VktSWWcMhIuYBVwIrgRXAeRGxomvYSuCE+rYG+Lse1ylJalGTI4dTgfHM3JaZu4D1wKquMauAT2flZuDwiDimx7VKklrSJByWANs7Hk/U0w50jCTpBWKkwZiYYlo+hzFExBqq004AT0fE7Q3W/2KwGHho0EUMCXsxyV5MsheTXtXGSpqEwwRwXMfjpcCO5zCGzFwHrAOIiI2ZOXZA1c5R9mKSvZhkLybZi0kRsbGN9TQ5rXQLcEJELI+IBcBqYEPXmA3A+fW7lk4DHs3MB3pcqySpJbMeOWTmnoi4CLgemAdcnZlbI+LCev5a4FrgbGAceBK4oH8lS5L6rclpJTLzWqoA6Jy2tuN+Ah88wHWvO8Dxc5m9mGQvJtmLSfZiUiu9iGq/LknSJC+fIUkq9D0cvPTGpAa9eE/dgy0R8e2IOHkQdbZhtl50jHt9ROyNiHPbrK9NTXoREadHxOaI2BoR/952jW1p8DtyWER8OSJurXsxJ1/fjIirI2LndG/3b2W/mZl9u1G9gH0P8ApgAXArsKJrzNnAV6k+K3Ea8N1+1jSoW8NevAk4or6/8sXci45x36R6vevcQdc9wO3icOAOYFn9+KhB1z3AXvw+cFl9fxR4BFgw6Nr70ItfBE4Bbp9mft/3m/0+cvDSG5Nm7UVmfjszf1w/vJnq8yJzUZPtAuBDwBeAnW0W17ImvXg3cE1m3geQmXO1H016kcChERHAS6nCYU+7ZfZfZt5I9b1Np+/7zX6Hg5femHSg3+cHqP4ymItm7UVELAHeBaxlbmuyXbwSOCIiboiITRFxfmvVtatJL64Afo7qQ7a3AR/OzH3tlDdU+r7fbPRW1uehZ5femAMaf58R8VaqcHhzXysanCa9+ARwcWburf5InLOa9GIEeB1wBnAI8J2IuDkz7+53cS1r0oszgc3A24CfAb4WETdl5k/6XNuw6ft+s9/h0LNLb8wBjb7PiDgJuApYmZkPt1Rb25r0Ygx
YXwfDYuDsiNiTmV9spcL2NP0deSgznwCeiIgbgZOBuRYOTXpxAfAXWZ14H4+Ie4ETgf9qp8Sh0ff9Zr9PK3npjUmz9iIilgHXAO+dg38Vdpq1F5m5PDOPz8zjgc8DvzMHgwGa/Y58CXhLRIxExEuANwB3tlxnG5r04j6qIygi4miqi9Bta7XK4dD3/WZfjxzSS288o2Ev/hg4Evhk/RfznpyDFxtr2IsXhSa9yMw7I+I6YAuwD7gqM+fcFY0bbheXAp+KiNuoTq1cnJlz7mqtEfEZ4HRgcURMAH8CzIf29pt+QlqSVPAT0pKkguEgSSoYDpKkguEgSSoYDpKkguEgSSoYDpKkguEgSSoYDpKkguEgSSoYDpKkguEgSSoYDpKkguEgSSoYDpKkguEgSSoYDpKkQl//TehMRhcuy137nhrU6hvZm3vIhSPMO2jBoEuZ0d59u9g3Pzho/nDXuW/3LvYtgBjyOnP3LvaNJActHN469z29iwXz93LQwfMHXcqM9j21m0Uju5h/8LxBlzKj3U/t5dCDnuKQQ2LQpcxq6227r8/Ms/q9noGFw659T/Gmxb82qNU38ujunew+/mheduiSQZcyo588dj+PL53PosXHDbqUGT3x0HaePCY4+NjhrvOpHdvZ9VO7WXj80kGXMq2nfzjBsUf9L4e+8uhBlzKjx+7+EW848ocs+fnDBl3KjO7f+ihnLPpvXn3ScIctwInLHljcxno8rSRJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqRCZOZgVhxxHdDKpWclaQ55qI3/5zCwcJAkDS9PK0mSCoaDJKmUmQd0A64GdgK3d0x7OfA14Af11yM65n0EGAfuAs7smP7rwBZgK/DxjukLgc/Wz/kucHzHvPfV6/gB8L5e1QkcCXwLeBy4oms5rwNuq+u5nMlTce8HHgQ217ffHFSdwEuArwD/XffzL4a4n9cBt9Z1rgXmDVs/63k3UG2z++s5qlf97GGNC4B1wN31z/6cIe3leVS/Q1vqn//iId02p9snDaKfbwc21X3bBLyt4zl/BmwHHu9afk/6+cxzmgzqKuAXgVO6vsGPA5fU9y8BLqvvr6DaESwElgP3APPqH9B9wGg97p+AM+r7vwOsre+vBj7b0cRt9dcj6vtH9KjORcCbgQun2GD+C3gjEMBXgZUdG8wVU6y39TqpwuGtHTuMmzrqHLZ+vqz+GsAXgNXD1s963g3A2BTLf9797GGNfwp8rL5/EJM73aHpJdX/qd/ZUdvHgY8O27bJzPukQfTztcCx9f1XA/d3POc04BjKcOhJP/ffDvi0UmbeCDzSNXlV3cz9Tf3VjunrM/PpzLyXKtFOBV4B3J2ZD9bjvg6cM8WyPg+cEREBnAl8LTMfycwfU6XstK/YH0idmflEZv4H8FTn4Ig4hmpn9p2suvzpju9tOq3XmZlPZua36vu7gO8BS6dY1kD7Wc/7SX13hCrIcrp11gZS5wyedz97WONvAH9ej9uXmQ/NUvsgehn1bVHdp5cBO6ZY1qC3zZn2SdPpZ53fz8z9fdoKHBwRC+t5N2fmA1Osoif93K9Xrzkcvb/Y+utR9fQlVIc/+03U08aBEyPi+IgYoWrIcd3Pycw9wKNUqT7dsnpR53SW1OuZbp3nRMSWiPh8RBT1t1jnMyLicOCdwDe66xmCfu6v8XqqvyYfo9qI9xu2fv5jRGyOiD+qf8meVU+P+3lANdY/Z4BLI+J7EfG5iDi6Y8hQ9DIzdwO/TXV6ZAfV2YR/6K5nCLbNmfZJMNh+ngN8PzOfnmVZPe1nv1+QjimmZZ1ev011fuwm4IfAnpmeM8P0fpppnV+mOqd3EtVfGf/U4Dl9VW/UnwEuz8xts9QzsDoz80yqw+KFwNvqycPWz/dk5i8Ab6lv752lnrbrHKE6OvzPzDwF+A7wl/W8oellRMyn+l1/LXAs1Tn
9j8xST+t1zrJPGlg/I+LngcuA32oyfJp6nlOdvQqHH9WnYPafitlZT5/g2em7lPqQMjO/nJlvyMw3Ur3w94Pu59Q7u8OoDsWmXVYP6pzOBJOnZ7rrf7gjyf+e6oXrZ9XfYp37rQN+kJmf6Jg2TP18RmY+BWygOhQeun5m5v3118eAf6Y6HfqsenrczwOt8WHgSeBf68efozqfPWy9fE1d0z31qdl/Ad7UXc8wbJvT7ZMG1c+IWEr18z0/M+9psKye9rNX4bCB6tVw6q9f6pi+OiIWRsRy4ASqF3iJiKPqr0dQvZBy1RTLOhf4Zr1RXQ+8IyKOqJ/zjnpaL+qcUn2Y91hEnFafVjh//3P2/0BrvwLcWd9vvc66no9RbQy/O8OyBtrPiHhpxy/CCHA21btshqqfETESEYvr+/OBXwZun2JZvezngW6bSfUX7en1pDOAO+qah6aXwP3AiogYrR+/vaOeodk2Yfp90iD6WZ82/Arwkcz8z+ewrOffz5zlFevuG9VpiweA3VSJ9AGq81rfoErabwAv7xj/B1TvUrqL+h00Hcu5o76t7ph+MNVfQeNUQfKKjnm/UU8fBy7ocZ0/pErZx+vxK+rpY1Q7hnuAK5h8K+ufU71QdCvVW+NOHFSdVH8JJNVGu5mOt9sNUz+Bo4FbmHy74N8CI0PYz0VUbx/cX+ffMPmW2+fdz17UWE//aeDGus5vAMuGrZf19Aupts0tVIF25LBtm7Psk1rvJ/CHwBNM/j5vZvLt1B+vn7+v/vrRXvZz/83LZ0iSCn5CWpJUMBwkSQXDQZJUMBwkSQXDQZJUGBl0AdKgRcReqss7jFC95fJ9mfnkYKuSBssjBwn+LzNfk5mvBnZRvS9felEzHKRnuwn42Yh4Z0R8NyK+HxFf77qonTTnGQ5Srb6kx0qqU0z/AZyWma8F1gO/N8japLb5moMEh0TE5vr+TVSXlH4V8Nn6ujoLgHsHVJs0EF4+Qy96EfF4Zr60a9oNwF9n5oaIOJ3q+jWnt1+dNBieVpKmdhjVFUVh8kqX0ouG4SBN7aPA5yLiJmC2f78pzTmeVpIkFTxykCQVDAdJUsFwkCQVDAdJUsFwkCQVDAdJUsFwkCQVDAdJUuH/AQp/DphEs/BoAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ "seasons_dash = panel.Column(season_select, season_plotter)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 117, "id": "020e0a4b-3759-45bb-aa6d-d12a767a9116", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "season 1
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.holoviews_exec.v0+json": "", + "text/html": [ + "
\n", + "
\n", + "
\n", + "" + ], + "text/plain": [ + "Column\n", + " [0] Select(name='season', options={'NH Winter': 0, ...}, value=2)\n", + " [1] ParamFunction(function)" + ] + }, + "execution_count": 117, + "metadata": { + "application/vnd.holoviews_exec.v0+json": { + "id": "1030" + } + }, + "output_type": "execute_result" + } + ], "source": [ "seasons_dash.servable()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 118, "id": "e54fad76-b0a8-4516-9f83-3dcbe48c2db8", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ "fig1 = matplotlib.pyplot.figure(figsize=(9,16))\n", "for ix1 in range(mslp_uk_seasonal_mean.shape[0]):\n", @@ -4543,20 +5199,99 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "16ebbf94-8f8c-4933-b024-07140ec1e189", - "metadata": {}, - "outputs": [], - "source": [ - "dask.array.concatenate" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 119, "id": "9aecfd1e-3d10-4b2f-9be6-f162e2cfe2dc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Array Chunk
Bytes 1.07 GiB 68.91 MiB
Shape (35064, 8181) (2208, 8181)
Count 159 Tasks 25 Chunks
Type float32 numpy.ndarray
\n", + "
\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " 8181\n", + " 35064\n", + "\n", + "
" + ], + "text/plain": [ + "dask.array" + ] + }, + "execution_count": 119, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "era5_flat_deseasoned = dask.array.concatenate(\n", " [(mslp_era5_uk_cube.extract(iris.Constraint(season_number=sn1)).core_data() - mslp_uk_seasonal_mean[sn1].core_data()).reshape(\n", @@ -4566,17 +5301,29 @@ "era5_flat_deseasoned" ] }, + { + "cell_type": "code", + "execution_count": 121, + "id": "561c8031-6a29-4492-9543-f8788229c412", + "metadata": {}, + "outputs": [], + "source": [ + "era5_flat_deseasoned = era5_flat_deseasoned.compute()" + ] + }, { "cell_type": "markdown", "id": "9ee55a13-592d-4423-b5ea-7e174231c623", "metadata": {}, "source": [ - "### Example 3 - Creating a catalog with Intake" + "## Example 3 - Creating a catalog with Intake\n", + "\n", + "When you think of a dataset, you probably think of a series of files in some format (often NetCDF files for weather and climate data, maybe [FITS](https://en.wikipedia.org/wiki/FITS) in astronomy or [BAM files](https://samtools.github.io/hts-specs/SAMv1.pdf) in genetics). Researchers in general don't care about the files themselves, but rather about getting the data they contain loaded into memory. There are now better options for presenting data than just an unordered collection of files. One such option is a *catalog*. The idea here is that the curated data files are presented logically in terms of the different datasets, with each logical dataset presented as a single object, irrespective of how the data is actually stored on disk. 
There could be one file or many files, or the data could be coming from a database or an API; either way, the researcher sees a dataset, which can then be requested and loaded into memory (either actually loading the data or lazily loading it on demand)." ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 122, "id": "5b4969f9-8c14-42c9-9c5e-b6fec5f893c4", "metadata": {}, "outputs": [], @@ -4586,7 +5333,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 123, "id": "c2f0325c-6f84-4527-89ba-fec74e232f66", "metadata": {}, "outputs": [ @@ -4617,17 +5364,17 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 124, "id": "b94d55b3-f698-4449-9620-e1313f613a7f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['rotors']" + "['rotors', 'rotors_preprocessed']" ] }, - "execution_count": 25, + "execution_count": 124, "metadata": {}, "output_type": "execute_result" } @@ -4638,7 +5385,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 125, "id": "a19b3021-4b10-4f30-a0e3-56611968a2d2", "metadata": {}, "outputs": [ @@ -5025,7 +5772,7 @@ "[20106 rows x 96 columns]" ] }, - "execution_count": 26, + "execution_count": 125, "metadata": {}, "output_type": "execute_result" } @@ -5046,25 +5793,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 131, "id": "f49fc329-be5f-4881-b31b-8596e381e6cd", "metadata": {}, "outputs": [], "source": [ - "# this could be pointing at cloud storage\n", "import fsspec" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 132, "id": "35464e52-6af1-4876-bd47-28004fec7936", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Dask DataFrame Structure:
\n", "
\n", "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0DTGair_temp_obsdewpoint_obswind_direction_obswind_speed_obswind_gust_obsair_temp_1air_temp_2air_temp_3...windspd_18winddir_19windspd_19winddir_20windspd_20winddir_21windspd_21winddir_22windspd_22rotors_present
00NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaN0.0
1101/01/2015 00:00283.9280.7110.04.1-9999999.0284.000283.625283.250...5.8341.06.0334.06.1330.06.0329.05.80.0
2201/01/2015 03:00280.7279.790.07.7-9999999.0281.500281.250280.750...6.8344.05.3348.03.8360.03.212.03.50.0
3301/01/2015 06:00279.8278.1100.07.7-9999999.0279.875279.625279.125...6.0345.05.5358.05.010.04.238.04.00.0
4401/01/2015 09:00279.9277.0120.07.2-9999999.0279.625279.250278.875...3.1338.03.5354.03.99.04.422.04.60.0
..................................................................
201012010131/12/2020 06:00276.7275.5270.03.6-9999999.0277.875277.750277.625...12.1223.011.8221.011.4219.011.3215.011.40.0
201022010231/12/2020 09:00277.9276.9270.03.1-9999999.0277.875277.625277.875...10.2230.010.8230.011.6227.012.3222.012.00.0
201032010331/12/2020 12:00283.5277.1220.03.6-9999999.0281.125280.625280.125...10.3218.011.9221.012.8222.011.9225.010.60.0
201042010431/12/2020 15:00286.1276.9250.03.6-9999999.0284.625284.125283.625...9.4218.08.6212.08.3218.08.7226.010.10.0
201052010501/01/2021 00:00285.1279.3300.06.2-9999999.0284.250284.000283.750...8.6241.010.2236.010.5232.010.5227.011.30.0
\n", + "

20106 rows × 96 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 DTG air_temp_obs dewpoint_obs \\\n", + "0 0 NaN NaN NaN \n", + "1 1 01/01/2015 00:00 283.9 280.7 \n", + "2 2 01/01/2015 03:00 280.7 279.7 \n", + "3 3 01/01/2015 06:00 279.8 278.1 \n", + "4 4 01/01/2015 09:00 279.9 277.0 \n", + "... ... ... ... ... \n", + "20101 20101 31/12/2020 06:00 276.7 275.5 \n", + "20102 20102 31/12/2020 09:00 277.9 276.9 \n", + "20103 20103 31/12/2020 12:00 283.5 277.1 \n", + "20104 20104 31/12/2020 15:00 286.1 276.9 \n", + "20105 20105 01/01/2021 00:00 285.1 279.3 \n", + "\n", + " wind_direction_obs wind_speed_obs wind_gust_obs air_temp_1 \\\n", + "0 NaN NaN NaN NaN \n", + "1 110.0 4.1 -9999999.0 284.000 \n", + "2 90.0 7.7 -9999999.0 281.500 \n", + "3 100.0 7.7 -9999999.0 279.875 \n", + "4 120.0 7.2 -9999999.0 279.625 \n", + "... ... ... ... ... \n", + "20101 270.0 3.6 -9999999.0 277.875 \n", + "20102 270.0 3.1 -9999999.0 277.875 \n", + "20103 220.0 3.6 -9999999.0 281.125 \n", + "20104 250.0 3.6 -9999999.0 284.625 \n", + "20105 300.0 6.2 -9999999.0 284.250 \n", + "\n", + " air_temp_2 air_temp_3 ... windspd_18 winddir_19 windspd_19 \\\n", + "0 NaN NaN ... NaN NaN NaN \n", + "1 283.625 283.250 ... 5.8 341.0 6.0 \n", + "2 281.250 280.750 ... 6.8 344.0 5.3 \n", + "3 279.625 279.125 ... 6.0 345.0 5.5 \n", + "4 279.250 278.875 ... 3.1 338.0 3.5 \n", + "... ... ... ... ... ... ... \n", + "20101 277.750 277.625 ... 12.1 223.0 11.8 \n", + "20102 277.625 277.875 ... 10.2 230.0 10.8 \n", + "20103 280.625 280.125 ... 10.3 218.0 11.9 \n", + "20104 284.125 283.625 ... 9.4 218.0 8.6 \n", + "20105 284.000 283.750 ... 8.6 241.0 10.2 \n", + "\n", + " winddir_20 windspd_20 winddir_21 windspd_21 winddir_22 windspd_22 \\\n", + "0 NaN NaN NaN NaN NaN NaN \n", + "1 334.0 6.1 330.0 6.0 329.0 5.8 \n", + "2 348.0 3.8 360.0 3.2 12.0 3.5 \n", + "3 358.0 5.0 10.0 4.2 38.0 4.0 \n", + "4 354.0 3.9 9.0 4.4 22.0 4.6 \n", + "... ... ... ... ... ... ... 
\n", + "20101 221.0 11.4 219.0 11.3 215.0 11.4 \n", + "20102 230.0 11.6 227.0 12.3 222.0 12.0 \n", + "20103 221.0 12.8 222.0 11.9 225.0 10.6 \n", + "20104 212.0 8.3 218.0 8.7 226.0 10.1 \n", + "20105 236.0 10.5 232.0 10.5 227.0 11.3 \n", + "\n", + " rotors_present \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "... ... \n", + "20101 0.0 \n", + "20102 0.0 \n", + "20103 0.0 \n", + "20104 0.0 \n", + "20105 0.0 \n", + "\n", + "[20106 rows x 96 columns]" + ] + }, + "execution_count": 134, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "intake.open_csv(fsspec.open_local(rotors_path)).read()" + ] + }, + { + "cell_type": "markdown", + "id": "edaaac56-cd20-41ce-bcf7-4c82ab87fdaa", + "metadata": {}, + "source": [ + "Now we can try to programmatically create a catalog " + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "id": "b2d12385-743f-4297-9b4b-c3afccef6a76", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"sources:\\n csv:\\n args:\\n urlpath:\\n - /Users/stephen.haddad/data/ukrse2022/2021_met_office_aviation_rotors.csv\\n description: ''\\n driver: intake.source.csv.CSVSource\\n metadata: {}\\n\"" + ] + }, + "execution_count": 136, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "intake.source.csv.CSVSource(fsspec.open_local(rotors_path)).yaml()" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "id": "0da242f4-cd39-461b-bff2-f5ad748f5b6a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sources:\n", + " csv:\n", + " args:\n", + " urlpath:\n", + " - /Users/stephen.haddad/data/ukrse2022/2021_met_office_aviation_rotors.csv\n", + " description: \"Tabular dataset with observed and simulated weather data, \\n \\\n", + " \\ intended for use training a machine learning model predicting turbulent wind\\\n", + " \\ gust events.\"\n", + " driver: intake.source.csv.CSVSource\n", + " metadata: 
{}\n", + "\n" + ] + } + ], + "source": [ + "with open(new_cat_fname , 'w') as cat_out:\n", + " constructed_catalog = intake.open_csv(fsspec.open_local(rotors_path))\n", + " constructed_catalog.description = '''Tabular dataset with observed and simulated weather data, \n", + " intended for use training a machine learning model predicting turbulent wind gust events.'''\n", + " catalog_txt = constructed_catalog.yaml()\n", + " print(catalog_txt)\n", + " cat_out.write(catalog_txt)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "id": "a9b92c4f-bd0d-462d-9071-43ef5a81b503", + "metadata": {}, + "outputs": [ + { + "data": { + "application/yaml": "my_catalog:\n args:\n path: my_catalog.yml\n description: ''\n driver: intake.catalog.local.YAMLFileCatalog\n metadata: {}\n", + "text/plain": [ + "my_catalog:\n", + " args:\n", + " path: my_catalog.yml\n", + " description: ''\n", + " driver: intake.catalog.local.YAMLFileCatalog\n", + " metadata: {}\n" + ] + }, + "metadata": { + "application/json": { + "root": "my_catalog" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "new_catalog = intake.open_catalog(new_cat_fname)\n", + "new_catalog" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "id": "164ddf43-f404-41a7-8cb1-1084c17e53d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -5473,119 +6890,206 @@ " \n", " 
\n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
Unnamed: 0DTGair_temp_obsdewpoint_obswind_direction_obswind_speed_obswind_gust_obsair_temp_1air_temp_2air_temp_3...windspd_18winddir_19windspd_19winddir_20windspd_20winddir_21windspd_21winddir_22windspd_22rotors_present
...................................................................................................................................................................................................00NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaN0.0
1101/01/2015 00:00283.9280.7110.04.1-9999999.0284.000283.625283.250...5.8341.06.0334.06.1330.06.0329.05.80.0
2201/01/2015 03:00280.7279.790.07.7-9999999.0281.500281.250280.750...6.8344.05.3348.03.8360.03.212.03.50.0
3301/01/2015 06:00279.8278.1100.07.7-9999999.0279.875279.625279.125...6.0345.05.5358.05.010.04.238.04.00.0
4401/01/2015 09:00279.9277.0120.07.2-9999999.0279.625279.250278.875...3.1338.03.5354.03.99.04.422.04.60.0
.....................
201012010131/12/2020 06:00276.7275.5270.03.6-9999999.0277.875277.750277.625...12.1223.011.8221.011.4219.011.3215.011.40.0
201022010231/12/2020 09:00277.9276.9270.03.1-9999999.0277.875277.625277.875...10.2230.010.8230.011.6227.012.3222.012.00.0
201032010331/12/2020 12:00283.5277.1220.03.6-9999999.0281.125280.625280.125...10.3218.011.9221.012.8222.011.9225.010.60.0
201042010431/12/2020 15:00286.1276.9250.03.6-9999999.0284.625284.125283.625...9.4218.08.6212.08.3218.08.7226.010.10.0
201052010501/01/2021 00:00285.1279.3300.06.2-9999999.0284.250284.000283.750...8.6241.010.2236.010.5232.010.5227.011.30.0
\n", - "
\n", - "
Dask Name: getitem, 22 tasks
" + "

20106 rows × 96 columns

\n", + "" ], "text/plain": [ - "Dask DataFrame Structure:\n", - " Unnamed: 0 DTG air_temp_obs dewpoint_obs wind_direction_obs wind_speed_obs wind_gust_obs air_temp_1 air_temp_2 air_temp_3 air_temp_4 air_temp_5 air_temp_6 air_temp_7 air_temp_8 air_temp_9 air_temp_10 air_temp_11 air_temp_12 air_temp_13 air_temp_14 air_temp_15 air_temp_16 air_temp_17 air_temp_18 air_temp_19 air_temp_20 air_temp_21 air_temp_22 sh_1 sh_2 sh_3 sh_4 sh_5 sh_6 sh_7 sh_8 sh_9 sh_10 sh_11 sh_12 sh_13 sh_14 sh_15 sh_16 sh_17 sh_18 sh_19 sh_20 sh_21 sh_22 winddir_1 windspd_1 winddir_2 windspd_2 winddir_3 windspd_3 winddir_4 windspd_4 winddir_5 windspd_5 winddir_6 windspd_6 winddir_7 windspd_7 winddir_8 windspd_8 winddir_9 windspd_9 winddir_10 windspd_10 winddir_11 windspd_11 winddir_12 windspd_12 winddir_13 windspd_13 winddir_14 windspd_14 winddir_15 windspd_15 winddir_16 windspd_16 winddir_17 windspd_17 winddir_18 windspd_18 winddir_19 windspd_19 winddir_20 windspd_20 winddir_21 windspd_21 winddir_22 windspd_22 rotors_present\n", - "npartitions=1 \n", - " int64 datetime64[ns] float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64\n", - " ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", - "Dask Name: getitem, 22 tasks" + " Unnamed: 0 DTG air_temp_obs dewpoint_obs \\\n", + "0 0 NaN NaN NaN \n", + "1 1 01/01/2015 00:00 283.9 280.7 \n", + "2 2 01/01/2015 03:00 280.7 279.7 \n", + "3 3 01/01/2015 06:00 279.8 278.1 \n", + "4 4 01/01/2015 09:00 279.9 277.0 \n", + "... ... ... ... ... \n", + "20101 20101 31/12/2020 06:00 276.7 275.5 \n", + "20102 20102 31/12/2020 09:00 277.9 276.9 \n", + "20103 20103 31/12/2020 12:00 283.5 277.1 \n", + "20104 20104 31/12/2020 15:00 286.1 276.9 \n", + "20105 20105 01/01/2021 00:00 285.1 279.3 \n", + "\n", + " wind_direction_obs wind_speed_obs wind_gust_obs air_temp_1 \\\n", + "0 NaN NaN NaN NaN \n", + "1 110.0 4.1 -9999999.0 284.000 \n", + "2 90.0 7.7 -9999999.0 281.500 \n", + "3 100.0 7.7 -9999999.0 279.875 \n", + "4 120.0 7.2 -9999999.0 279.625 \n", + "... ... ... ... ... \n", + "20101 270.0 3.6 -9999999.0 277.875 \n", + "20102 270.0 3.1 -9999999.0 277.875 \n", + "20103 220.0 3.6 -9999999.0 281.125 \n", + "20104 250.0 3.6 -9999999.0 284.625 \n", + "20105 300.0 6.2 -9999999.0 284.250 \n", + "\n", + " air_temp_2 air_temp_3 ... windspd_18 winddir_19 windspd_19 \\\n", + "0 NaN NaN ... NaN NaN NaN \n", + "1 283.625 283.250 ... 5.8 341.0 6.0 \n", + "2 281.250 280.750 ... 6.8 344.0 5.3 \n", + "3 279.625 279.125 ... 6.0 345.0 5.5 \n", + "4 279.250 278.875 ... 3.1 338.0 3.5 \n", + "... ... ... ... ... ... ... \n", + "20101 277.750 277.625 ... 12.1 223.0 11.8 \n", + "20102 277.625 277.875 ... 10.2 230.0 10.8 \n", + "20103 280.625 280.125 ... 10.3 218.0 11.9 \n", + "20104 284.125 283.625 ... 9.4 218.0 8.6 \n", + "20105 284.000 283.750 ... 
8.6 241.0 10.2 \n", + "\n", + " winddir_20 windspd_20 winddir_21 windspd_21 winddir_22 windspd_22 \\\n", + "0 NaN NaN NaN NaN NaN NaN \n", + "1 334.0 6.1 330.0 6.0 329.0 5.8 \n", + "2 348.0 3.8 360.0 3.2 12.0 3.5 \n", + "3 358.0 5.0 10.0 4.2 38.0 4.0 \n", + "4 354.0 3.9 9.0 4.4 22.0 4.6 \n", + "... ... ... ... ... ... ... \n", + "20101 221.0 11.4 219.0 11.3 215.0 11.4 \n", + "20102 230.0 11.6 227.0 12.3 222.0 12.0 \n", + "20103 221.0 12.8 222.0 11.9 225.0 10.6 \n", + "20104 212.0 8.3 218.0 8.7 226.0 10.1 \n", + "20105 236.0 10.5 232.0 10.5 227.0 11.3 \n", + "\n", + " rotors_present \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "... ... \n", + "20101 0.0 \n", + "20102 0.0 \n", + "20103 0.0 \n", + "20104 0.0 \n", + "20105 0.0 \n", + "\n", + "[20106 rows x 96 columns]" ] }, - "execution_count": 31, + "execution_count": 139, "metadata": {}, "output_type": "execute_result" } ], - "source": [ - "rotors_df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "90bc1b70-135a-4601-a85c-49a8fa5723cb", - "metadata": {}, - "outputs": [], - "source": [ - "new_cat_fname = 'my_catalog.yml'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "49690bf4-ece7-44ab-89c1-dfb91a653982", - "metadata": {}, - "outputs": [], - "source": [ - "intake.open_csv(fsspec.open_local(rotors_path)).read()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "755ec6a0-c43f-4dbf-a0d6-84e5f579fac8", - "metadata": {}, - "outputs": [], - "source": [ - "cat1 = intake.catalog.()\n", - "cat1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "960dbaa4-e4b4-420c-a9b6-600b900a6ce2", - "metadata": {}, - "outputs": [], - "source": [ - "cat1.name = 'rotors'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b2d12385-743f-4297-9b4b-c3afccef6a76", - "metadata": {}, - "outputs": [], - "source": [ - "intake.source.csv.CSVSource(fsspec.open_local(rotors_path)).yaml()" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "0da242f4-cd39-461b-bff2-f5ad748f5b6a", - "metadata": {}, - "outputs": [], - "source": [ - "with open(new_cat_fname , 'w') as cat_out:\n", - " constructed_catalog = intake.open_csv(fsspec.open_local(rotors_path))\n", - " constructed_catalog.description = '''Tabular dataset with observed and simulated weather data, \n", - " intended for use training a machine learning model predicting turbulent wind gust events.'''\n", - " catalog_txt = constructed_catalog.yaml()\n", - " print(catalog_txt)\n", - " cat_out.write(catalog_txt)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a9b92c4f-bd0d-462d-9071-43ef5a81b503", - "metadata": {}, - "outputs": [], - "source": [ - "new_catalog = intake.open_catalog(new_cat_fname)\n", - "new_catalog" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "164ddf43-f404-41a7-8cb1-1084c17e53d0", - "metadata": {}, - "outputs": [], "source": [ "new_catalog['csv'].read()" ] @@ -5605,7 +7109,15 @@ "id": "b557aad3-d2f0-4fef-86c5-93d7b6f67bc4", "metadata": {}, "source": [ - "### Next Steps and further reading" + "### Next Steps and further reading\n", + "\n", + "Here we have seen how we can use assorted tools and practices to make the data preparation part of a machine learning project more scalable and reproducible, and also make the datasets created more FAIR through the use of catalogs to abstract away the details of data handling from a researcher who doesn't and shouldn't need to care about the technical details. We have used these tools in quite a basic way; there are many more advanced options and tools that implement the same principles. Similarly, these examples are very weather and climate focused as well as Python-centric. Different tools or languages will be used for different research areas (e.g. 
R and R Shiny may be popular)\n", + "\n", + "* Scaling data pipelines with dask - [docs](https://docs.dask.org/en/stable/) [Udemy course](https://www.udemy.com/course/scalable-data-analysis-in-python-with-dask/)\n", + " * Machine Learning with dask - [docs](https://ml.dask.org/)\n", + "* Building a dashboard with Holoviz - [tutorial](https://holoviz.org/tutorial/exercises/Building_a_Dashboard.html)\n", + " * Building a dashboard in R with Shiny - [docs](https://shiny.rstudio.com/articles/dashboards.html)\n", + "* Supporting FAIR data - [European Commission report](https://ec.europa.eu/info/sites/default/files/turning_fair_into_reality_0.pdf)" ] }, { @@ -5615,8 +7127,12 @@ "source": [ "### References\n", "\n", - "* [Ray]()\n", - "* [Holoviz](https://holoviz.org/)" + "* [Pandas](https://pandas.pydata.org/docs/)\n", + "* [dask](https://docs.dask.org/en/stable/)\n", + "* [Iris](https://scitools-iris.readthedocs.io/en/stable/)\n", + "* [Holoviz](https://holoviz.org/)\n", + "* [intake](https://intake.readthedocs.io/en/latest/)\n", + "* [fsspec](https://filesystem-spec.readthedocs.io/en/latest/)\n" ] } ],