From 9f9732604d15e0c48c210eedd67cb25025aade79 Mon Sep 17 00:00:00 2001 From: Sam Hollings <52575338+SamHollings@users.noreply.github.com> Date: Thu, 16 May 2024 16:26:50 +0000 Subject: [PATCH 1/8] Added a notebook which has the main pipeline code and run in colab button --- .gitignore | 1 + rap_example_pipeline_python.ipynb | 132 ++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 rap_example_pipeline_python.ipynb diff --git a/.gitignore b/.gitignore index 292d171..cbd5120 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ !tests/backtests/ground_truth/*.csv *.ipynb +!rap_example_pipeline_python.ipynb *.ipynb_checkpoints *.xlsx *.xls diff --git a/rap_example_pipeline_python.ipynb b/rap_example_pipeline_python.ipynb new file mode 100644 index 0000000..3f963e0 --- /dev/null +++ b/rap_example_pipeline_python.ipynb @@ -0,0 +1,132 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RAP Example Python Pipeline - Interactive Exercise" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Purpose of the script: to provide an example of good practices when structuring a pipeline using PySpark\n", + "\n", + "The script loads Python packages but also internal modules (e.g. modules.helpers, helpers script from the modules folder).\n", + "It then loads various configuration variables and a logger, for more info on see the RAP Community of Practice website:\n", + "https://nhsdigital.github.io/rap-community-of-practice/\n", + "\n", + "Most of the code to carry out this configuration and setup is found in the utils folder.\n", + "\n", + "Then, the main pipeline itself begins, which has three phases:\n", + "\n", + "data_ingestion: \n", + " we download the artificial hes data, load it into a spark dataframe. Any other cleaning or preprocessing should\n", + " happen at this stage\n", + "processing: \n", + " we process the data as needed, in this case we create some aggregate counts based on the hes data\n", + "data_exports: \n", + " finally we write our outputs to an appropriate file type (CSV)\n", + "\n", + "Note that in the src folder, each of these phases has its own folder, to neatly organise the code used for each one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# this part imports our Python packages, pyspark functions, and our project's own modules\n", + "import logging\n", + "import timeit \n", + "from datetime import datetime \n", + "\n", + "from pyspark.sql import functions as F\n", + "\n", + "from src.utils import file_paths\n", + "from src.utils import logging_config\n", + "from src.utils import spark as spark_utils\n", + "from src.data_ingestion import get_data\n", + "from src.data_ingestion import reading_data\n", + "from src.processing import aggregate_counts\n", + "from src.data_exports import write_csv\n", + "\n", + "logger = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def main():\n", + " \n", + " # load config, here we load our project's parameters from the config.toml file\n", + " config = file_paths.get_config() \n", + "\n", + " # configure logging\n", + " logging_config.configure_logging(config['log_dir'])\n", + " logger.info(f\"Configured logging with log folder: {config['log_dir']}.\")\n", + " logger.info(f\"Logging the config settings:\\n\\n\\t{config}\\n\")\n", + " logger.info(f\"Starting run at:\\t{datetime.now().time()}\")\n", + "\n", + " # get artificial HES data as CSV\n", + " get_data.download_zip_from_url(config['data_url'], overwrite=True)\n", + " logger.info(f\"Downloaded artificial hes as zip.\")\n", + "\n", + " # create spark session\n", + " spark = spark_utils.create_spark_session(config['project_name'])\n", + " logger.info(f\"created spark session with app name: {config['project_name']}\")\n", + "\n", + " # Loading data from CSV as spark data frame\n", + " df_hes_data = reading_data.load_csv_into_spark_data_frame(spark, config['path_to_downloaded_data'])\n", + "\n", + " # Creating dictionary to hold outputs\n", + " outputs = {}\n", + "\n", + " # Count number of episodes in England - place this in the outputs dictionary\n", + " outputs[\"df_hes_england_count\"] = aggregate_counts.get_distinct_count(df_hes_data, 'epikey', 'number_of_episodes')\n", + "\n", + " # Rename and save spark dataframes as CSVs:\n", + " for output_name, output in outputs.items():\n", + " write_csv.save_spark_dataframe_as_csv(output, output_name)\n", + " logger.info(f\"saved output df {output_name} as csv\")\n", + " write_csv.rename_csv_output(output_name)\n", + " logger.info(f\"renamed {output_name} file\")\n", + " \n", + " # stop the spark session\n", + " spark.stop()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " print(f\"Running create_publication script\")\n", + " start_time = timeit.default_timer()\n", + " main()\n", + " total_time = timeit.default_timer() - start_time\n", + " logger.info(f\"Running time of create_publication script: {int(total_time / 60)} minutes and {round(total_time%60)} seconds.\\n\")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From b19645b1117597b31ab535c2af9cc1671496c325 Mon Sep 17 00:00:00 2001 From: Sam Hollings <52575338+SamHollings@users.noreply.github.com> Date: Tue, 28 May 2024 12:39:44 +0000 Subject: [PATCH 2/8] Made the notebook use a pandas pipeline and updated HES files location --- config.toml | 2 +- rap_example_pipeline_python.ipynb | 143 +++++++++++++++++++++--------- requirements.txt | 18 ++-- 3 files changed, 111 insertions(+), 52 deletions(-) 
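The config.toml change below only swaps the download URL; the notebook itself picks these settings up through file_paths.get_config(). A minimal sketch of what such a helper could look like, assuming it simply parses the TOML file with the toml package already listed in requirements.txt (the real implementation lives in src/utils/file_paths.py and may differ):

    import toml

    def get_config(config_path: str = "config.toml") -> dict:
        # read the pipeline settings (data_url, paths, log_dir, ...) from the TOML file
        with open(config_path, encoding="utf-8") as config_file:
            return toml.load(config_file)

    config = get_config()
    print(config["data_url"], config["path_to_downloaded_data"])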
diff --git a/config.toml b/config.toml index c7d7543..21c7a8f 100644 --- a/config.toml +++ b/config.toml @@ -1,6 +1,6 @@ project_name = "example_pipeline_pyspark_version" -data_url = "https://s3.eu-west-2.amazonaws.com/files.digital.nhs.uk/assets/Services/Artificial+data/Artificial+HES+final/artificial_hes_ae_202302_v1_sample.zip" +data_url = "https://files.digital.nhs.uk/assets/Services/Artificial%20data/Artificial%20HES%20final/artificial_hes_ae_202302_v1_sample.zip" path_to_downloaded_data = "data_in/artificial_hes_ae_202302_v1_sample.zip/artificial_hes_ae_202302_v1_sample/artificial_hes_ae_2122.csv" # Here we describe where the output and logs are saved, change as necessary diff --git a/rap_example_pipeline_python.ipynb b/rap_example_pipeline_python.ipynb index 3f963e0..158245e 100644 --- a/rap_example_pipeline_python.ipynb +++ b/rap_example_pipeline_python.ipynb @@ -39,6 +39,13 @@ "Note that in the src folder, each of these phases has its own folder, to neatly organise the code used for each one." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, { "cell_type": "code", "execution_count": null, @@ -63,49 +70,33 @@ "logger = logging.getLogger(__name__)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Config" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "def main():\n", - " \n", - " # load config, here we load our project's parameters from the config.toml file\n", - " config = file_paths.get_config() \n", - "\n", - " # configure logging\n", - " logging_config.configure_logging(config['log_dir'])\n", - " logger.info(f\"Configured logging with log folder: {config['log_dir']}.\")\n", - " logger.info(f\"Logging the config settings:\\n\\n\\t{config}\\n\")\n", - " logger.info(f\"Starting run at:\\t{datetime.now().time()}\")\n", - "\n", - " # get artificial HES data as CSV\n", - " get_data.download_zip_from_url(config['data_url'], overwrite=True)\n", - " logger.info(f\"Downloaded artificial hes as zip.\")\n", - "\n", - " # create spark session\n", - " spark = spark_utils.create_spark_session(config['project_name'])\n", - " logger.info(f\"created spark session with app name: {config['project_name']}\")\n", - "\n", - " # Loading data from CSV as spark data frame\n", - " df_hes_data = reading_data.load_csv_into_spark_data_frame(spark, config['path_to_downloaded_data'])\n", - "\n", - " # Creating dictionary to hold outputs\n", - " outputs = {}\n", - "\n", - " # Count number of episodes in England - place this in the outputs dictionary\n", - " outputs[\"df_hes_england_count\"] = aggregate_counts.get_distinct_count(df_hes_data, 'epikey', 'number_of_episodes')\n", - "\n", - " # Rename and save spark dataframes as CSVs:\n", - " for output_name, output in outputs.items():\n", - " write_csv.save_spark_dataframe_as_csv(output, output_name)\n", - " logger.info(f\"saved output df {output_name} as csv\")\n", - " write_csv.rename_csv_output(output_name)\n", - " logger.info(f\"renamed {output_name} file\")\n", - " \n", - " # stop the spark session\n", - " spark.stop()" + "config = file_paths.get_config() \n", + "\n", + "# configure logging\n", + "logging_config.configure_logging(config['log_dir'])\n", + "logger.info(f\"Configured logging with log folder: {config['log_dir']}.\")\n", + "logger.info(f\"Logging the config settings:\\n\\n\\t{config}\\n\")\n", + "logger.info(f\"Starting run at:\\t{datetime.now().time()}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ 
+ "## Load Data" ] }, { @@ -114,17 +105,83 @@ "metadata": {}, "outputs": [], "source": [ - " print(f\"Running create_publication script\")\n", - " start_time = timeit.default_timer()\n", - " main()\n", - " total_time = timeit.default_timer() - start_time\n", - " logger.info(f\"Running time of create_publication script: {int(total_time / 60)} minutes and {round(total_time%60)} seconds.\\n\")" + "# get artificial HES data as CSV\n", + "get_data.download_zip_from_url(config['data_url'], overwrite=True)\n", + "logger.info(f\"Downloaded artificial hes as zip.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df_hes_data = pd.read_csv(config['path_to_downloaded_data'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_distinct_count(df: pd.DataFrame, col_to_aggregate: str) -> int:\n", + " \"\"\"Returns the number of distinct values in a column of a pandas DataFrame.\"\"\"\n", + " return df[col_to_aggregate].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating dictionary to hold outputs\n", + "outputs = {}\n", + "\n", + "# Count number of episodes in England - place this in the outputs dictionary\n", + "outputs[\"df_hes_england_count\"] = get_distinct_count(df_hes_data, 'EPIKEY')\n", + "\n", + "# Rename and save spark dataframes as CSVs:\n", + "for output_name, output in outputs.items():\n", + "\n", + " import pandas as pd\n", + "\n", + " # Create a DataFrame with the integer value\n", + " df_output = pd.DataFrame({'england_count': [outputs[\"df_hes_england_count\"]]})\n", + "\n", + " # prep the filepath and ensure the directory exists\n", + " from pathlib import Path\n", + " output_file = 'my_file.csv'\n", + " output_dir = Path(f'data_out/{output_name}')\n", + " output_dir.mkdir(parents=True, exist_ok=True)\n", + " output_filename = output_dir /f'{output_name}.csv'\n", + "\n", + " # Save the DataFrame to a CSV file\n", + " df_output.to_csv(output_filename, index=False)\n", + " logger.info(f\"saved output df to {output_filename}\")" ] } ], "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" } }, "nbformat": 4, diff --git a/requirements.txt b/requirements.txt index a67ad87..9b7167b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ # See https://nhsd-git.digital.nhs.uk/data-services/analytics-service/iuod/rap-community-of-practice/-/blob/master/python/project-structure-and-packaging.md # PySpark -pyspark==3.2.1 +pyspark # Requests - used to collect data requests @@ -10,22 +10,24 @@ requests # Python version = 3.10.* # Data manipulation -numpy==1.21.5 -pandas==1.3.5 +numpy +pandas # SQL connections -pyodbc==4.0.35 -sqlalchemy==1.4.46 +pyodbc +sqlalchemy # Excel output #openpyxl==3.0.9 # Testing -pytest==6.2.5 -pytest-html==3.1.1 +pytest +pytest-html # Dependencies of the above packages #ipykernel==6.9.0 #nbformat==5.1.3 -toml==0.10.2 +toml #pathlib2==2.3.6 + +jupyter From 85b123de4d36d69e05c10fd6cbf51f84e768fe3b Mon Sep 17 00:00:00 2001 From: Sam Hollings <52575338+SamHollings@users.noreply.github.com> 
Date: Mon, 10 Jun 2024 11:53:48 +0000 Subject: [PATCH 3/8] Added some more to the into to python notebook --- rap_example_pipeline_python.ipynb | 484 +++++++++++++++++++++++++++--- 1 file changed, 442 insertions(+), 42 deletions(-) diff --git a/rap_example_pipeline_python.ipynb b/rap_example_pipeline_python.ipynb index 158245e..0bac70a 100644 --- a/rap_example_pipeline_python.ipynb +++ b/rap_example_pipeline_python.ipynb @@ -11,32 +11,27 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "ADD THE GOOGLE COLAB LINK HERE" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Purpose of the script: to provide an example of good practices when structuring a pipeline using PySpark\n", + "## Intro\n", "\n", - "The script loads Python packages but also internal modules (e.g. modules.helpers, helpers script from the modules folder).\n", - "It then loads various configuration variables and a logger, for more info on see the RAP Community of Practice website:\n", - "https://nhsdigital.github.io/rap-community-of-practice/\n", + "This notebook will show you how straight-forward it is to do an analytical pipeline in Python.\n", "\n", - "Most of the code to carry out this configuration and setup is found in the utils folder.\n", + "The core of any of piece of analytical work is to:\n", + "- load some data\n", + "- do something to do that, e.g. process it, do some analysis\n", + "- create some output\n", "\n", - "Then, the main pipeline itself begins, which has three phases:\n", + "This notebook will go briefly through each of these showing *one* way of doing it in Python (there are many more!). \n", "\n", - "data_ingestion: \n", - " we download the artificial hes data, load it into a spark dataframe. Any other cleaning or preprocessing should\n", - " happen at this stage\n", - "processing: \n", - " we process the data as needed, in this case we create some aggregate counts based on the hes data\n", - "data_exports: \n", - " finally we write our outputs to an appropriate file type (CSV)\n", + "Open this notebook in google colab and have a play - try changing bits and see what happens!\n", "\n", - "Note that in the src folder, each of these phases has its own folder, to neatly organise the code used for each one." + "**NOTE**: to make the workshop more straight forward, we haven't completely followed good practice. If you want to see a pipeline how it should be, well laid out and modularised, [see our Example Python pipeline](https://github.com/NHSDigital/RAP_example_pipeline_python)." ] }, { @@ -46,28 +41,55 @@ "## Setup" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will need to install a few things before we can get going.\n", + "\n", + "First, if this is running in Google Colab, we need to clone the repo and install the right python packages." 
+ ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "# this part imports our Python packages, pyspark functions, and our project's own modules\n", - "import logging\n", - "import timeit \n", - "from datetime import datetime \n", - "\n", - "from pyspark.sql import functions as F\n", + "# this forces google collab to install the dependencies\n", + "if \"google.colab\" in str(get_ipython()):\n", + " print(\"Running on Colab\")\n", + " !git clone https://github.com/NHSDigital/RAP_example_pipeline_python.git -q\n", + " %cd RAP_example_pipeline_python\n", + " !pip install -r requirements.txt -q -q" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we need to import the right libraries for this piece of work:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "import logging # this allows us to write log messages helping with any future auditing and debugging\n", + "import timeit # this allows us to time the execution of the code\n", + "from datetime import datetime # this allows us to work with dates and times\n", + "import pandas as pd # this allows us to work with dataframes\n", "\n", + "# these are the modules we have created to help us with the pipeline\n", "from src.utils import file_paths\n", "from src.utils import logging_config\n", "from src.utils import spark as spark_utils\n", - "from src.data_ingestion import get_data\n", - "from src.data_ingestion import reading_data\n", + "import src.data_ingestion\n", "from src.processing import aggregate_counts\n", "from src.data_exports import write_csv\n", - "\n", - "logger = logging.getLogger(__name__)" + "\n" ] }, { @@ -77,15 +99,45 @@ "## Config" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's important that we don't hardcode things which can change into the code - instead we keep things like that in config files.\n", + "\n", + "An example is where the data is to be picked up from and where any outputs will be saved to: these will change from when you are working in \"dev\" to when the code is finalised and put into \"production\"." 
+ ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "config = file_paths.get_config() \n", - "\n", - "# configure logging\n", + "config = file_paths.get_config() " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-06-10 11:42:34,860 - INFO -- 1487816006.py: ():4 -- Configured logging with log folder: .\n", + "2024-06-10 11:42:34,861 - INFO -- 1487816006.py: ():5 -- Logging the config settings:\n", + "\n", + "\t{'project_name': 'example_pipeline_pyspark_version', 'data_url': 'https://files.digital.nhs.uk/assets/Services/Artificial%20data/Artificial%20HES%20final/artificial_hes_ae_202302_v1_sample.zip', 'path_to_downloaded_data': 'data_in/artificial_hes_ae_202302_v1_sample.zip/artificial_hes_ae_202302_v1_sample/artificial_hes_ae_2122.csv', 'output_dir': '', 'log_dir': ''}\n", + "\n", + "2024-06-10 11:42:34,862 - INFO -- 1487816006.py: ():6 -- Starting run at:\t11:42:34.862941\n" + ] + } + ], + "source": [ + "# initialise and configure logging\n", + "logger = logging.getLogger(__name__)\n", "logging_config.configure_logging(config['log_dir'])\n", "logger.info(f\"Configured logging with log folder: {config['log_dir']}.\")\n", "logger.info(f\"Logging the config settings:\\n\\n\\t{config}\\n\")\n", @@ -99,44 +151,385 @@ "## Load Data" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First we will load the data: we're going to use an artificial fake version of the NHS Hospital Episode Statistics Accident and Emergency (HES AE) data from 2003. \n", + "\n", + "We've hidden all the complexity of aquiring the data away in a function - called \"get_data\". This is good practice, because:\n", + "\n", + "1. this data might be used many times in many different pipelines - this function can be reused, saving your colleagues time\n", + "2. the way the data is acquired might change, e.g. in different platforms, to accomodate this we only need to add to, change or improve this function - your downstream pipeline should continue as normal\n", + "\n", + "This function:\n", + "- gets the location of the data from the config file\n", + "- downloads the CSV\n", + "- loads that CSV into a pandas dataframe in memory\n", + "\n", + "This is just an example - in another setting we could make it load the data from a SQL server, or from a database, S3 bucket, etc." 
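To make the SQL option mentioned above concrete, here is a hedged sketch of an alternative loader using pandas and SQLAlchemy (both already in requirements.txt); the connection string, table name and query are purely hypothetical:

    import pandas as pd
    from sqlalchemy import create_engine

    def get_data_from_sql(connection_string: str, query: str) -> pd.DataFrame:
        # load the data from a SQL database instead of the downloaded CSV
        engine = create_engine(connection_string)
        return pd.read_sql(query, engine)

    # hypothetical usage:
    # df_hes_data = get_data_from_sql(
    #     "mssql+pyodbc://my_server/my_database?driver=ODBC+Driver+17+for+SQL+Server",
    #     "SELECT * FROM artificial_hes_ae_2122",
    # )

Either way, the rest of the pipeline keeps working against a pandas dataframe, which is the point of hiding the acquisition behind a single function.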
+ ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ - "# get artificial HES data as CSV\n", - "get_data.download_zip_from_url(config['data_url'], overwrite=True)\n", - "logger.info(f\"Downloaded artificial hes as zip.\")" + "def get_data(config):\n", + " \"\"\"Get the data from the data source and return it as a pandas dataframe\n", + " \n", + " Args:\n", + " config (dict): the configuration dictionary\n", + "\n", + " Returns:\n", + " pandas dataframe: the data \n", + " \"\"\"\n", + "\n", + " # get the data location from the config\n", + " data_location = config['data_url']\n", + " print(\"the data came from here: \", data_location) # let's print the location so you can see where it is stored - it's a publicly available zip.\n", + "\n", + " # download the CSV file\n", + " src.data_ingestion.get_data.download_zip_from_url(data_location, overwrite=True)\n", + " logger.info(f\"Downloaded data as zip.\")\n", + "\n", + " # read the CSV file into a pandas dataframe\n", + " df_data = pd.read_csv(config['path_to_downloaded_data'])\n", + "\n", + " return df_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we will use our get data function to... get the data! Look how simple it makes the code below to read - it does what it says on the tin" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "the data came from here: https://files.digital.nhs.uk/assets/Services/Artificial%20data/Artificial%20HES%20final/artificial_hes_ae_202302_v1_sample.zip\n", + "2024-06-10 11:42:38,509 - INFO -- 734666532.py: get_data():10 -- Downloaded data as zip.\n" + ] + } + ], + "source": [ + "df_hes_data = get_data(config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see what this data looks like, and pull the first 5 rows:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FYEARPARTYEARPSEUDO_HESIDAEKEYAEKEY_FLAGAEARRIVALMODEAEATTEND_EXC_PLANNEDAEATTENDCATAEATTENDDISPAEDEPTTYPE...LSOA11MSOA11PROVDISTPROVDIST_FLAGNER_GP_PRACTICENER_RESIDENCENER_TREATMENTSITETRETSITEDISTSITEDIST_FLAG
02122202103TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK910587081231121131...E01000385E0200176819.373.0QKSQKSQHMRW6014.895.0
12122202103TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK747777461989121131...E01030571E020048333.963.0QMJQYGQKSRY9011.215.0
22122202103TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK244053969711121133...E01008938E02005828NaN3.0QWEQKKQWORJC0215.165.0
32122202103TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK425257514835121111...E01030533E0200091223.683.0QMJQRVQOPRJE073.165.0
42122202103TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK892001219292121131...E01025434E020043155.853.0QMFQM7QMJRDE032.415.0
\n", + "

5 rows × 165 columns

\n", + "
" + ], + "text/plain": [ + " FYEAR PARTYEAR PSEUDO_HESID AEKEY \\\n", + "0 2122 202103 TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK 910587081231 \n", + "1 2122 202103 TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK 747777461989 \n", + "2 2122 202103 TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK 244053969711 \n", + "3 2122 202103 TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK 425257514835 \n", + "4 2122 202103 TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK 892001219292 \n", + "\n", + " AEKEY_FLAG AEARRIVALMODE AEATTEND_EXC_PLANNED AEATTENDCAT AEATTENDDISP \\\n", + "0 1 2 1 1 3 \n", + "1 1 2 1 1 3 \n", + "2 1 2 1 1 3 \n", + "3 1 2 1 1 1 \n", + "4 1 2 1 1 3 \n", + "\n", + " AEDEPTTYPE ... LSOA11 MSOA11 PROVDIST PROVDIST_FLAG \\\n", + "0 1 ... E01000385 E02001768 19.37 3.0 \n", + "1 1 ... E01030571 E02004833 3.96 3.0 \n", + "2 3 ... E01008938 E02005828 NaN 3.0 \n", + "3 1 ... E01030533 E02000912 23.68 3.0 \n", + "4 1 ... E01025434 E02004315 5.85 3.0 \n", + "\n", + " NER_GP_PRACTICE NER_RESIDENCE NER_TREATMENT SITETRET SITEDIST \\\n", + "0 QKS QKS QHM RW601 4.89 \n", + "1 QMJ QYG QKS RY901 1.21 \n", + "2 QWE QKK QWO RJC02 15.16 \n", + "3 QMJ QRV QOP RJE07 3.16 \n", + "4 QMF QM7 QMJ RDE03 2.41 \n", + "\n", + " SITEDIST_FLAG \n", + "0 5.0 \n", + "1 5.0 \n", + "2 5.0 \n", + "3 5.0 \n", + "4 5.0 \n", + "\n", + "[5 rows x 165 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_hes_data.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Processing" + ] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "import pandas as pd\n", + "Now the fun part - we get to do some interesting processing on the data.\n", "\n", - "df_hes_data = pd.read_csv(config['path_to_downloaded_data'])" + "The simplest piece of processing you might do is simply get a distinct count on one of the columns. \n", + "\n", + "Again, we create a function to do this - for a very small bit of processing like this it might not make a lot of sense, but if you were doing a larger derivation that might feasibly be used in other work, it could really save someone else some time!" 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "def get_distinct_count(df: pd.DataFrame, col_to_aggregate: str) -> int:\n", - " \"\"\"Returns the number of distinct values in a column of a pandas DataFrame.\"\"\"\n", + " \"\"\"Returns the number of distinct values in a column of a pandas DataFrame.\n", + " \n", + " Args:\n", + " df (pd.DataFrame): the pandas DataFrame\n", + " col_to_aggregate (str): the column to aggregate\n", + "\n", + " Returns:\n", + " int: the number of distinct values\n", + " \"\"\"\n", " return df[col_to_aggregate].nunique()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's run our simple analysis and print the result:" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Distinct EPIKEY count: 10000\n" + ] + } + ], + "source": [ + "distinct_epikey_count = get_distinct_count(df_hes_data, 'EPIKEY')\n", + "print(f\"Distinct EPIKEY count: {distinct_epikey_count}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-05-28 14:18:00,084 - INFO -- 1024386828.py: ():24 -- saved output df to data_out/df_hes_england_count/df_hes_england_count.csv\n" + ] + } + ], "source": [ "# Creating dictionary to hold outputs\n", "outputs = {}\n", @@ -163,6 +556,13 @@ " df_output.to_csv(output_filename, index=False)\n", " logger.info(f\"saved output df to {output_filename}\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 9b80cabdbd7599254989274655b98163cb9d0a26 Mon Sep 17 00:00:00 2001 From: Sam Hollings <52575338+SamHollings@users.noreply.github.com> Date: Mon, 10 Jun 2024 11:54:27 +0000 Subject: [PATCH 4/8] cleared outputs --- rap_example_pipeline_python.ipynb | 282 ++---------------------------- 1 file changed, 15 insertions(+), 267 deletions(-) diff --git a/rap_example_pipeline_python.ipynb b/rap_example_pipeline_python.ipynb index 0bac70a..0d6996a 100644 --- a/rap_example_pipeline_python.ipynb +++ b/rap_example_pipeline_python.ipynb @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -110,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -119,22 +119,9 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-06-10 11:42:34,860 - INFO -- 1487816006.py: ():4 -- Configured logging with log folder: .\n", - "2024-06-10 11:42:34,861 - INFO -- 1487816006.py: ():5 -- Logging the config settings:\n", - "\n", - "\t{'project_name': 'example_pipeline_pyspark_version', 'data_url': 'https://files.digital.nhs.uk/assets/Services/Artificial%20data/Artificial%20HES%20final/artificial_hes_ae_202302_v1_sample.zip', 'path_to_downloaded_data': 'data_in/artificial_hes_ae_202302_v1_sample.zip/artificial_hes_ae_202302_v1_sample/artificial_hes_ae_2122.csv', 'output_dir': '', 'log_dir': ''}\n", - "\n", - "2024-06-10 11:42:34,862 - INFO 
-- 1487816006.py: ():6 -- Starting run at:\t11:42:34.862941\n" - ] - } - ], + "outputs": [], "source": [ "# initialise and configure logging\n", "logger = logging.getLogger(__name__)\n", @@ -172,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -209,18 +196,9 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "the data came from here: https://files.digital.nhs.uk/assets/Services/Artificial%20data/Artificial%20HES%20final/artificial_hes_ae_202302_v1_sample.zip\n", - "2024-06-10 11:42:38,509 - INFO -- 734666532.py: get_data():10 -- Downloaded data as zip.\n" - ] - } - ], + "outputs": [], "source": [ "df_hes_data = get_data(config)" ] @@ -234,223 +212,9 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
FYEARPARTYEARPSEUDO_HESIDAEKEYAEKEY_FLAGAEARRIVALMODEAEATTEND_EXC_PLANNEDAEATTENDCATAEATTENDDISPAEDEPTTYPE...LSOA11MSOA11PROVDISTPROVDIST_FLAGNER_GP_PRACTICENER_RESIDENCENER_TREATMENTSITETRETSITEDISTSITEDIST_FLAG
02122202103TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK910587081231121131...E01000385E0200176819.373.0QKSQKSQHMRW6014.895.0
12122202103TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK747777461989121131...E01030571E020048333.963.0QMJQYGQKSRY9011.215.0
22122202103TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK244053969711121133...E01008938E02005828NaN3.0QWEQKKQWORJC0215.165.0
32122202103TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK425257514835121111...E01030533E0200091223.683.0QMJQRVQOPRJE073.165.0
42122202103TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK892001219292121131...E01025434E020043155.853.0QMFQM7QMJRDE032.415.0
\n", - "

5 rows × 165 columns

\n", - "
" - ], - "text/plain": [ - " FYEAR PARTYEAR PSEUDO_HESID AEKEY \\\n", - "0 2122 202103 TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK 910587081231 \n", - "1 2122 202103 TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK 747777461989 \n", - "2 2122 202103 TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK 244053969711 \n", - "3 2122 202103 TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK 425257514835 \n", - "4 2122 202103 TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK 892001219292 \n", - "\n", - " AEKEY_FLAG AEARRIVALMODE AEATTEND_EXC_PLANNED AEATTENDCAT AEATTENDDISP \\\n", - "0 1 2 1 1 3 \n", - "1 1 2 1 1 3 \n", - "2 1 2 1 1 3 \n", - "3 1 2 1 1 1 \n", - "4 1 2 1 1 3 \n", - "\n", - " AEDEPTTYPE ... LSOA11 MSOA11 PROVDIST PROVDIST_FLAG \\\n", - "0 1 ... E01000385 E02001768 19.37 3.0 \n", - "1 1 ... E01030571 E02004833 3.96 3.0 \n", - "2 3 ... E01008938 E02005828 NaN 3.0 \n", - "3 1 ... E01030533 E02000912 23.68 3.0 \n", - "4 1 ... E01025434 E02004315 5.85 3.0 \n", - "\n", - " NER_GP_PRACTICE NER_RESIDENCE NER_TREATMENT SITETRET SITEDIST \\\n", - "0 QKS QKS QHM RW601 4.89 \n", - "1 QMJ QYG QKS RY901 1.21 \n", - "2 QWE QKK QWO RJC02 15.16 \n", - "3 QMJ QRV QOP RJE07 3.16 \n", - "4 QMF QM7 QMJ RDE03 2.41 \n", - "\n", - " SITEDIST_FLAG \n", - "0 5.0 \n", - "1 5.0 \n", - "2 5.0 \n", - "3 5.0 \n", - "4 5.0 \n", - "\n", - "[5 rows x 165 columns]" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df_hes_data.head(5)" ] @@ -475,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -501,17 +265,9 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Distinct EPIKEY count: 10000\n" - ] - } - ], + "outputs": [], "source": [ "distinct_epikey_count = get_distinct_count(df_hes_data, 'EPIKEY')\n", "print(f\"Distinct EPIKEY count: {distinct_epikey_count}\")" @@ -519,17 +275,9 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2024-05-28 14:18:00,084 - INFO -- 1024386828.py: ():24 -- saved output df to data_out/df_hes_england_count/df_hes_england_count.csv\n" - ] - } - ], + "outputs": [], "source": [ "# Creating dictionary to hold outputs\n", "outputs = {}\n", From b618ee0f3ee95ce2c846f5b92e0fc4a239fb7131 Mon Sep 17 00:00:00 2001 From: jenniferstruthers1-nhs Date: Thu, 11 Jul 2024 15:58:42 +0000 Subject: [PATCH 5/8] simplified repo for workshop --- config.toml | 8 - create_publication.py | 85 ---- data_out/.gitkeep | 0 environment.yml | 12 - pyproject.toml | 15 - rap_example_pipeline_python.ipynb | 443 +++++++++++++----- src/data_exports/__init__.py | 0 src/data_exports/write_csv.py | 47 -- src/data_ingestion/__init__.py | 0 src/data_ingestion/get_data.py | 49 -- src/data_ingestion/preprocessing.py | 3 - src/data_ingestion/reading_data.py | 27 -- src/data_ingestion/validation_checks.py | 3 - src/processing/__init__.py | 0 src/processing/aggregate_counts.py | 73 --- src/utils/__init__.py | 0 src/utils/file_paths.py | 34 -- src/utils/logging_config.py | 32 -- src/utils/spark.py | 26 - tests/__init__.py | 0 tests/backtests/README.md | 4 - tests/backtests/__init__.py | 0 tests/backtests/backtesting_params.py | 13 - .../hes_england_count_expected_output.csv | 2 - tests/backtests/test_compare_outputs.py | 30 -- tests/unittests/README.md | 5 - tests/unittests/__init__.py | 0 
tests/unittests/test_aggregate_counts.py | 63 --- tests/unittests/test_spark.py | 15 - 29 files changed, 325 insertions(+), 664 deletions(-) delete mode 100644 config.toml delete mode 100644 create_publication.py delete mode 100644 data_out/.gitkeep delete mode 100644 environment.yml delete mode 100644 pyproject.toml delete mode 100644 src/data_exports/__init__.py delete mode 100644 src/data_exports/write_csv.py delete mode 100644 src/data_ingestion/__init__.py delete mode 100644 src/data_ingestion/get_data.py delete mode 100644 src/data_ingestion/preprocessing.py delete mode 100644 src/data_ingestion/reading_data.py delete mode 100644 src/data_ingestion/validation_checks.py delete mode 100644 src/processing/__init__.py delete mode 100644 src/processing/aggregate_counts.py delete mode 100644 src/utils/__init__.py delete mode 100644 src/utils/file_paths.py delete mode 100644 src/utils/logging_config.py delete mode 100644 src/utils/spark.py delete mode 100644 tests/__init__.py delete mode 100644 tests/backtests/README.md delete mode 100644 tests/backtests/__init__.py delete mode 100644 tests/backtests/backtesting_params.py delete mode 100644 tests/backtests/ground_truth/hes_england_count_expected_output.csv delete mode 100644 tests/backtests/test_compare_outputs.py delete mode 100644 tests/unittests/README.md delete mode 100644 tests/unittests/__init__.py delete mode 100644 tests/unittests/test_aggregate_counts.py delete mode 100644 tests/unittests/test_spark.py diff --git a/config.toml b/config.toml deleted file mode 100644 index 21c7a8f..0000000 --- a/config.toml +++ /dev/null @@ -1,8 +0,0 @@ -project_name = "example_pipeline_pyspark_version" - -data_url = "https://files.digital.nhs.uk/assets/Services/Artificial%20data/Artificial%20HES%20final/artificial_hes_ae_202302_v1_sample.zip" -path_to_downloaded_data = "data_in/artificial_hes_ae_202302_v1_sample.zip/artificial_hes_ae_202302_v1_sample/artificial_hes_ae_2122.csv" - -# Here we describe where the output and logs are saved, change as necessary -output_dir = '' -log_dir = '' diff --git a/create_publication.py b/create_publication.py deleted file mode 100644 index 2eb5ce3..0000000 --- a/create_publication.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -Purpose of the script: to provide an example of good practices when structuring a pipeline using PySpark - -The script loads Python packages but also internal modules (e.g. modules.helpers, helpers script from the modules folder). -It then loads various configuration variables and a logger, for more info on see the RAP Community of Practice website: -https://nhsdigital.github.io/rap-community-of-practice/ - -Most of the code to carry out this configuration and setup is found in the utils folder. - -Then, the main pipeline itself begins, which has three phases: - -data_ingestion: - we download the artificial hes data, load it into a spark dataframe. Any other cleaning or preprocessing should - happen at this stage -processing: - we process the data as needed, in this case we create some aggregate counts based on the hes data -data_exports: - finally we write our outputs to an appropriate file type (CSV) - -Note that in the src folder, each of these phases has its own folder, to neatly organise the code used for each one. 
- -""" - -# this part imports our Python packages, pyspark functions, and our project's own modules -import logging -import timeit -from datetime import datetime - -from pyspark.sql import functions as F - -from src.utils import file_paths -from src.utils import logging_config -from src.utils import spark as spark_utils -from src.data_ingestion import get_data -from src.data_ingestion import reading_data -from src.processing import aggregate_counts -from src.data_exports import write_csv - -logger = logging.getLogger(__name__) - -def main(): - - # load config, here we load our project's parameters from the config.toml file - config = file_paths.get_config() - - # configure logging - logging_config.configure_logging(config['log_dir']) - logger.info(f"Configured logging with log folder: {config['log_dir']}.") - logger.info(f"Logging the config settings:\n\n\t{config}\n") - logger.info(f"Starting run at:\t{datetime.now().time()}") - - # get artificial HES data as CSV - get_data.download_zip_from_url(config['data_url'], overwrite=True) - logger.info(f"Downloaded artificial hes as zip.") - - # create spark session - spark = spark_utils.create_spark_session(config['project_name']) - logger.info(f"created spark session with app name: {config['project_name']}") - - # Loading data from CSV as spark data frame - df_hes_data = reading_data.load_csv_into_spark_data_frame(spark, config['path_to_downloaded_data']) - - # Creating dictionary to hold outputs - outputs = {} - - # Count number of episodes in England - place this in the outputs dictionary - outputs["df_hes_england_count"] = aggregate_counts.get_distinct_count(df_hes_data, 'epikey', 'number_of_episodes') - - # Rename and save spark dataframes as CSVs: - for output_name, output in outputs.items(): - write_csv.save_spark_dataframe_as_csv(output, output_name) - logger.info(f"saved output df {output_name} as csv") - write_csv.rename_csv_output(output_name) - logger.info(f"renamed {output_name} file") - - # stop the spark session - spark.stop() - - -if __name__ == "__main__": - print(f"Running create_publication script") - start_time = timeit.default_timer() - main() - total_time = timeit.default_timer() - start_time - logger.info(f"Running time of create_publication script: {int(total_time / 60)} minutes and {round(total_time%60)} seconds.\n") diff --git a/data_out/.gitkeep b/data_out/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/environment.yml b/environment.yml deleted file mode 100644 index 3e815a2..0000000 --- a/environment.yml +++ /dev/null @@ -1,12 +0,0 @@ -# The libraries used by your code should be listed here -name: rap_template # your project name (no spaces!) 
-channels: - - default - - conda-forge -dependencies: - - python=3.10.5 - - pip - - pandas=1.4.4 - - pyodbc=4.0.35 - - sqlalchemy=1.4.46 - - toml=0.10.2 diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 959f9bb..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,15 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[project] -name = "rap-package-template" -version = "1.0.0" -authors = [ - { name="Data Science Skilled team", email="datascience@nhs.net" } -] -readme = "README.md" -requires-python = ">=3.10" - -[project.urls] -"Homepage" = "https://nhsdigital.github.io/rap-community-of-practice/" diff --git a/rap_example_pipeline_python.ipynb b/rap_example_pipeline_python.ipynb index 0d6996a..a3f17a9 100644 --- a/rap_example_pipeline_python.ipynb +++ b/rap_example_pipeline_python.ipynb @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -73,23 +73,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ - "import logging # this allows us to write log messages helping with any future auditing and debugging\n", - "import timeit # this allows us to time the execution of the code\n", - "from datetime import datetime # this allows us to work with dates and times\n", - "import pandas as pd # this allows us to work with dataframes\n", - "\n", - "# these are the modules we have created to help us with the pipeline\n", - "from src.utils import file_paths\n", - "from src.utils import logging_config\n", - "from src.utils import spark as spark_utils\n", - "import src.data_ingestion\n", - "from src.processing import aggregate_counts\n", - "from src.data_exports import write_csv\n", - "\n" + "import pandas as pd # this allows us to work with dataframes" ] }, { @@ -110,25 +98,16 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "config = file_paths.get_config() " - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ - "# initialise and configure logging\n", - "logger = logging.getLogger(__name__)\n", - "logging_config.configure_logging(config['log_dir'])\n", - "logger.info(f\"Configured logging with log folder: {config['log_dir']}.\")\n", - "logger.info(f\"Logging the config settings:\\n\\n\\t{config}\\n\")\n", - "logger.info(f\"Starting run at:\\t{datetime.now().time()}\")" + "zip_file_url = \"https://files.digital.nhs.uk/assets/Services/Artificial%20data/Artificial%20HES%20final/artificial_hes_ae_202302_v1_sample.zip\"\n", + "path_to_downloaded_data = \"data_in/artificial_hes_ae_202302_v1_sample.zip/artificial_hes_ae_202302_v1_sample/artificial_hes_ae_2122.csv\"\n", + "\n", + "# You can update these\n", + "col_to_aggregate = \"LSOA11\"\n", + "col_to_count = \"EPIKEY\"\n" ] }, { @@ -159,48 +138,77 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ - "def get_data(config):\n", - " \"\"\"Get the data from the data source and return it as a pandas dataframe\n", - " \n", - " Args:\n", - " config (dict): the configuration dictionary\n", - "\n", - " Returns:\n", - " pandas dataframe: the data \n", - " \"\"\"\n", - "\n", - " # get the data location from the config\n", - " data_location = config['data_url']\n", - " print(\"the data came from here: \", data_location) # let's print 
the location so you can see where it is stored - it's a publicly available zip.\n", + "import zipfile\n", + "import io\n", + "from pathlib import Path\n", + "import requests\n", "\n", - " # download the CSV file\n", - " src.data_ingestion.get_data.download_zip_from_url(data_location, overwrite=True)\n", - " logger.info(f\"Downloaded data as zip.\")\n", "\n", - " # read the CSV file into a pandas dataframe\n", - " df_data = pd.read_csv(config['path_to_downloaded_data'])\n", + "filename = Path(zip_file_url).name\n", + "output_path = f\"data_in/{filename}\"\n", "\n", - " return df_data" + "response = requests.get(zip_file_url, stream=True,timeout=3600)\n", + "downloaded_zip = zipfile.ZipFile(io.BytesIO(response.content))\n", + "downloaded_zip.extractall(output_path)\n", + "\n" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 73, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['FYEAR', 'PARTYEAR', 'PSEUDO_HESID', 'AEKEY', 'AEKEY_FLAG', 'AEARRIVALMODE', 'AEATTEND_EXC_PLANNED', 'AEATTENDCAT', 'AEATTENDDISP', 'AEDEPTTYPE', 'AEINCLOCTYPE', 'AEPATGROUP', 'AEREFSOURCE', 'AT_GP_PRACTICE', 'AT_RESIDENCE', 'AT_TREATMENT', 'ARRIVALAGE', 'ARRIVALAGE_CALC', 'ARRIVALDATE', 'ARRIVALTIME', 'CANNET', 'CANREG', 'CCG_GP_PRACTICE', 'CCG_RESIDENCE', 'CCG_RESPONSIBILITY', 'CCG_RESPONSIBILITY_ORIGIN', 'CCG_TREATMENT', 'CCG_TREATMENT_ORIGIN', 'CR_GP_PRACTICE', 'CR_RESIDENCE', 'CR_TREATMENT', 'CONCLDUR', 'CONCLTIME', 'DEPDUR', 'DEPTIME', 'DIAG2_01', 'DIAG2_02', 'DIAG2_03', 'DIAG2_04', 'DIAG2_05', 'DIAG2_06', 'DIAG2_07', 'DIAG2_08', 'DIAG2_09', 'DIAG2_10', 'DIAG2_11', 'DIAG2_12', 'DIAG3_01', 'DIAG3_02', 'DIAG3_03', 'DIAG3_04', 'DIAG3_05', 'DIAG3_06', 'DIAG3_07', 'DIAG3_08', 'DIAG3_09', 'DIAG3_10', 'DIAG3_11', 'DIAG3_12', 'DIAGA_01', 'DIAGA_02', 'DIAGA_03', 'DIAGA_04', 'DIAGA_05', 'DIAGA_06', 'DIAGA_07', 'DIAGA_08', 'DIAGA_09', 'DIAGA_10', 'DIAGA_11', 'DIAGA_12', 'DIAGS_01', 'DIAGS_02', 'DIAGS_03', 'DIAGS_04', 'DIAGS_05', 'DIAGS_06', 'DIAGS_07', 'DIAGS_08', 'DIAGS_09', 'DIAGS_10', 'DIAGS_11', 'DIAGS_12', 'DIAGSCHEME', 'ETHNOS', 'EPIKEY', 'GORTREAT', 'GPPRAC', 'INITDUR', 'INITTIME', 'INVEST2_01', 'INVEST2_02', 'INVEST2_03', 'INVEST2_04', 'INVEST2_05', 'INVEST2_06', 'INVEST2_07', 'INVEST2_08', 'INVEST2_09', 'INVEST2_10', 'INVEST2_11', 'INVEST2_12', 'INVESTSCHEME', 'PCON', 'PCON_ONS', 'PCTCODE_HIS', 'PCTORIG_HIS', 'PCTTREAT', 'PROCODE3', 'PROCODE5', 'PROCODET', 'PROCSCHEME', 'PURCODE', 'RANK_ORDER', 'RESCTY', 'RESCTY_ONS', 'RESGOR', 'RESGOR_ONS', 'RESLADST', 'RESLADST_ONS', 'RESPCT_HIS', 'RESSTHA_HIS', 'SEX', 'LSOA01', 'MSOA01', 'STHATRET', 'SUSHRG', 'SUSLDDATE_HIS', 'TREAT2_01', 'TREAT2_02', 'TREAT2_03', 'TREAT2_04', 'TREAT2_05', 'TREAT2_06', 'TREAT2_07', 'TREAT2_08', 'TREAT2_09', 'TREAT2_10', 'TREAT2_11', 'TREAT2_12', 'TREAT3_01', 'TREAT3_02', 'TREAT3_03', 'TREAT3_04', 'TREAT3_05', 'TREAT3_06', 'TREAT3_07', 'TREAT3_08', 'TREAT3_09', 'TREAT3_10', 'TREAT3_11', 'TREAT3_12', 'TREATSCHEME', 'TRETDUR', 'TRETTIME', 'LSOA11', 'MSOA11', 'PROVDIST', 'PROVDIST_FLAG', 'NER_GP_PRACTICE', 'NER_RESIDENCE', 'NER_TREATMENT', 'SITETRET', 'SITEDIST', 'SITEDIST_FLAG']\n", + " SEX\n", + "LSOA11 \n", + "E01000009 1\n", + "E01000012 1\n", + "E01000020 1\n", + "E01000034 1\n", + "E01000052 1\n", + "... 
...\n", + "W01000488 1\n", + "W01000861 1\n", + "W01001587 1\n", + "W01001685 1\n", + "W01001928 1\n", + "\n", + "[8304 rows x 1 columns]\n" + ] + }, + { + "ename": "TypeError", + "evalue": "ZMQDisplayPublisher.publish() got an unexpected keyword argument 'truncate'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[73], line 8\u001b[0m\n\u001b[1;32m 6\u001b[0m df \u001b[38;5;241m=\u001b[39m df\u001b[38;5;241m.\u001b[39mgroupby([\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLSOA11\u001b[39m\u001b[38;5;124m\"\u001b[39m])\u001b[38;5;241m.\u001b[39mnunique()\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28mprint\u001b[39m(df)\n\u001b[0;32m----> 8\u001b[0m \u001b[43mdisplay\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtruncate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.local/lib/python3.10/site-packages/IPython/core/display_functions.py:305\u001b[0m, in \u001b[0;36mdisplay\u001b[0;34m(include, exclude, metadata, transient, display_id, raw, clear, *objs, **kwargs)\u001b[0m\n\u001b[1;32m 302\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m metadata:\n\u001b[1;32m 303\u001b[0m \u001b[38;5;66;03m# kwarg-specified metadata gets precedence\u001b[39;00m\n\u001b[1;32m 304\u001b[0m _merge(md_dict, metadata)\n\u001b[0;32m--> 305\u001b[0m \u001b[43mpublish_display_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mformat_dict\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmd_dict\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 306\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m display_id:\n\u001b[1;32m 307\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DisplayHandle(display_id)\n", + "File \u001b[0;32m~/.local/lib/python3.10/site-packages/IPython/core/display_functions.py:93\u001b[0m, in \u001b[0;36mpublish_display_data\u001b[0;34m(data, metadata, source, transient, **kwargs)\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m transient:\n\u001b[1;32m 91\u001b[0m kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtransient\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m transient\n\u001b[0;32m---> 93\u001b[0m \u001b[43mdisplay_pub\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpublish\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 94\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 95\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 96\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m 97\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mTypeError\u001b[0m: ZMQDisplayPublisher.publish() got an unexpected keyword argument 'truncate'" + ] + } + ], "source": [ - "Now we will use our get data function to... get the data! 
Look how simple it makes the code below to read - it does what it says on the tin" + "\n", + "# read the CSV file into a pandas dataframe\n", + "df_data = pd.read_csv(path_to_downloaded_data)" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "df_hes_data = get_data(config)" + "Now we will use our get data function to... get the data! Look how simple it makes the code below to read - it does what it says on the tin" ] }, { @@ -212,11 +220,225 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 74, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FYEARPARTYEARPSEUDO_HESIDAEKEYAEKEY_FLAGAEARRIVALMODEAEATTEND_EXC_PLANNEDAEATTENDCATAEATTENDDISPAEDEPTTYPE...LSOA11MSOA11PROVDISTPROVDIST_FLAGNER_GP_PRACTICENER_RESIDENCENER_TREATMENTSITETRETSITEDISTSITEDIST_FLAG
02122202103TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK910587081231121131...E01000385E0200176819.373.0QKSQKSQHMRW6014.895.0
12122202103TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK747777461989121131...E01030571E020048333.963.0QMJQYGQKSRY9011.215.0
22122202103TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK244053969711121133...E01008938E02005828NaN3.0QWEQKKQWORJC0215.165.0
32122202103TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK425257514835121111...E01030533E0200091223.683.0QMJQRVQOPRJE073.165.0
42122202103TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK892001219292121131...E01025434E020043155.853.0QMFQM7QMJRDE032.415.0
\n", + "

5 rows × 165 columns

\n", + "
" + ], + "text/plain": [ + " FYEAR PARTYEAR PSEUDO_HESID AEKEY \\\n", + "0 2122 202103 TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK 910587081231 \n", + "1 2122 202103 TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK 747777461989 \n", + "2 2122 202103 TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK 244053969711 \n", + "3 2122 202103 TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK 425257514835 \n", + "4 2122 202103 TESTqPNh7HEHdm1sB5QlvVaSQZS7BekK 892001219292 \n", + "\n", + " AEKEY_FLAG AEARRIVALMODE AEATTEND_EXC_PLANNED AEATTENDCAT AEATTENDDISP \\\n", + "0 1 2 1 1 3 \n", + "1 1 2 1 1 3 \n", + "2 1 2 1 1 3 \n", + "3 1 2 1 1 1 \n", + "4 1 2 1 1 3 \n", + "\n", + " AEDEPTTYPE ... LSOA11 MSOA11 PROVDIST PROVDIST_FLAG \\\n", + "0 1 ... E01000385 E02001768 19.37 3.0 \n", + "1 1 ... E01030571 E02004833 3.96 3.0 \n", + "2 3 ... E01008938 E02005828 NaN 3.0 \n", + "3 1 ... E01030533 E02000912 23.68 3.0 \n", + "4 1 ... E01025434 E02004315 5.85 3.0 \n", + "\n", + " NER_GP_PRACTICE NER_RESIDENCE NER_TREATMENT SITETRET SITEDIST \\\n", + "0 QKS QKS QHM RW601 4.89 \n", + "1 QMJ QYG QKS RY901 1.21 \n", + "2 QWE QKK QWO RJC02 15.16 \n", + "3 QMJ QRV QOP RJE07 3.16 \n", + "4 QMF QM7 QMJ RDE03 2.41 \n", + "\n", + " SITEDIST_FLAG \n", + "0 5.0 \n", + "1 5.0 \n", + "2 5.0 \n", + "3 5.0 \n", + "4 5.0 \n", + "\n", + "[5 rows x 165 columns]" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "df_hes_data.head(5)" + "df_data.head(5)" ] }, { @@ -237,25 +459,6 @@ "Again, we create a function to do this - for a very small bit of processing like this it might not make a lot of sense, but if you were doing a larger derivation that might feasibly be used in other work, it could really save someone else some time!" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_distinct_count(df: pd.DataFrame, col_to_aggregate: str) -> int:\n", - " \"\"\"Returns the number of distinct values in a column of a pandas DataFrame.\n", - " \n", - " Args:\n", - " df (pd.DataFrame): the pandas DataFrame\n", - " col_to_aggregate (str): the column to aggregate\n", - "\n", - " Returns:\n", - " int: the number of distinct values\n", - " \"\"\"\n", - " return df[col_to_aggregate].nunique()" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -265,52 +468,56 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "distinct_epikey_count = get_distinct_count(df_hes_data, 'EPIKEY')\n", - "print(f\"Distinct EPIKEY count: {distinct_epikey_count}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 76, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['FYEAR', 'PARTYEAR', 'PSEUDO_HESID', 'AEKEY', 'AEKEY_FLAG', 'AEARRIVALMODE', 'AEATTEND_EXC_PLANNED', 'AEATTENDCAT', 'AEATTENDDISP', 'AEDEPTTYPE', 'AEINCLOCTYPE', 'AEPATGROUP', 'AEREFSOURCE', 'AT_GP_PRACTICE', 'AT_RESIDENCE', 'AT_TREATMENT', 'ARRIVALAGE', 'ARRIVALAGE_CALC', 'ARRIVALDATE', 'ARRIVALTIME', 'CANNET', 'CANREG', 'CCG_GP_PRACTICE', 'CCG_RESIDENCE', 'CCG_RESPONSIBILITY', 'CCG_RESPONSIBILITY_ORIGIN', 'CCG_TREATMENT', 'CCG_TREATMENT_ORIGIN', 'CR_GP_PRACTICE', 'CR_RESIDENCE', 'CR_TREATMENT', 'CONCLDUR', 'CONCLTIME', 'DEPDUR', 'DEPTIME', 'DIAG2_01', 'DIAG2_02', 'DIAG2_03', 'DIAG2_04', 'DIAG2_05', 'DIAG2_06', 'DIAG2_07', 'DIAG2_08', 'DIAG2_09', 'DIAG2_10', 'DIAG2_11', 'DIAG2_12', 'DIAG3_01', 'DIAG3_02', 'DIAG3_03', 'DIAG3_04', 'DIAG3_05', 'DIAG3_06', 'DIAG3_07', 'DIAG3_08', 'DIAG3_09', 
'DIAG3_10', 'DIAG3_11', 'DIAG3_12', 'DIAGA_01', 'DIAGA_02', 'DIAGA_03', 'DIAGA_04', 'DIAGA_05', 'DIAGA_06', 'DIAGA_07', 'DIAGA_08', 'DIAGA_09', 'DIAGA_10', 'DIAGA_11', 'DIAGA_12', 'DIAGS_01', 'DIAGS_02', 'DIAGS_03', 'DIAGS_04', 'DIAGS_05', 'DIAGS_06', 'DIAGS_07', 'DIAGS_08', 'DIAGS_09', 'DIAGS_10', 'DIAGS_11', 'DIAGS_12', 'DIAGSCHEME', 'ETHNOS', 'EPIKEY', 'GORTREAT', 'GPPRAC', 'INITDUR', 'INITTIME', 'INVEST2_01', 'INVEST2_02', 'INVEST2_03', 'INVEST2_04', 'INVEST2_05', 'INVEST2_06', 'INVEST2_07', 'INVEST2_08', 'INVEST2_09', 'INVEST2_10', 'INVEST2_11', 'INVEST2_12', 'INVESTSCHEME', 'PCON', 'PCON_ONS', 'PCTCODE_HIS', 'PCTORIG_HIS', 'PCTTREAT', 'PROCODE3', 'PROCODE5', 'PROCODET', 'PROCSCHEME', 'PURCODE', 'RANK_ORDER', 'RESCTY', 'RESCTY_ONS', 'RESGOR', 'RESGOR_ONS', 'RESLADST', 'RESLADST_ONS', 'RESPCT_HIS', 'RESSTHA_HIS', 'SEX', 'LSOA01', 'MSOA01', 'STHATRET', 'SUSHRG', 'SUSLDDATE_HIS', 'TREAT2_01', 'TREAT2_02', 'TREAT2_03', 'TREAT2_04', 'TREAT2_05', 'TREAT2_06', 'TREAT2_07', 'TREAT2_08', 'TREAT2_09', 'TREAT2_10', 'TREAT2_11', 'TREAT2_12', 'TREAT3_01', 'TREAT3_02', 'TREAT3_03', 'TREAT3_04', 'TREAT3_05', 'TREAT3_06', 'TREAT3_07', 'TREAT3_08', 'TREAT3_09', 'TREAT3_10', 'TREAT3_11', 'TREAT3_12', 'TREATSCHEME', 'TRETDUR', 'TRETTIME', 'LSOA11', 'MSOA11', 'PROVDIST', 'PROVDIST_FLAG', 'NER_GP_PRACTICE', 'NER_RESIDENCE', 'NER_TREATMENT', 'SITETRET', 'SITEDIST', 'SITEDIST_FLAG']\n" + ] + }, + { + "ename": "KeyError", + "evalue": "'LSOA11'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'LSOA11'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[76], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m df_data \u001b[38;5;241m=\u001b[39m df_data[[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLSOA11\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSEX\u001b[39m\u001b[38;5;124m\"\u001b[39m]]\n\u001b[1;32m 3\u001b[0m df_data \u001b[38;5;241m=\u001b[39m 
df_data\u001b[38;5;241m.\u001b[39mgroupby([\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLSOA11\u001b[39m\u001b[38;5;124m\"\u001b[39m])\u001b[38;5;241m.\u001b[39mnunique()\n\u001b[0;32m----> 5\u001b[0m distinct_epikey_count \u001b[38;5;241m=\u001b[39m \u001b[43mdf_data\u001b[49m\u001b[43m[\u001b[49m\u001b[43mcol_to_aggregate\u001b[49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mnunique()\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDistinct EPIKEY count: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdistinct_epikey_count\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/core/frame.py:4102\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4102\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4104\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", + "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. 
Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[0;31mKeyError\u001b[0m: 'LSOA11'" + ] + } + ], "source": [ - "# Creating dictionary to hold outputs\n", - "outputs = {}\n", - "\n", - "# Count number of episodes in England - place this in the outputs dictionary\n", - "outputs[\"df_hes_england_count\"] = get_distinct_count(df_hes_data, 'EPIKEY')\n", - "\n", - "# Rename and save spark dataframes as CSVs:\n", - "for output_name, output in outputs.items():\n", "\n", - " import pandas as pd\n", "\n", - " # Create a DataFrame with the integer value\n", - " df_output = pd.DataFrame({'england_count': [outputs[\"df_hes_england_count\"]]})\n", + "print(list(df_data.columns))\n", + "df_data = df_data[[\"LSOA11\", \"SEX\"]]\n", + "df_data = df_data.groupby([\"LSOA11\"]).nunique()\n", "\n", - " # prep the filepath and ensure the directory exists\n", - " from pathlib import Path\n", - " output_file = 'my_file.csv'\n", - " output_dir = Path(f'data_out/{output_name}')\n", - " output_dir.mkdir(parents=True, exist_ok=True)\n", - " output_filename = output_dir /f'{output_name}.csv'\n", + "distinct_epikey_count = df_data[col_to_aggregate].nunique()\n", + "print(f\"Distinct EPIKEY count: {distinct_epikey_count}\")\n", "\n", - " # Save the DataFrame to a CSV file\n", - " df_output.to_csv(output_filename, index=False)\n", - " logger.info(f\"saved output df to {output_filename}\")" + "# find nice cols\n", + "# make plt bar chart showing...something cool\n", + "# double check all the comments\n", + "# update readme\n", + "# create new repo for this branch\n", + "# show sam\n", + "# ... profit\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/src/data_exports/__init__.py b/src/data_exports/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/data_exports/write_csv.py b/src/data_exports/write_csv.py deleted file mode 100644 index d20258e..0000000 --- a/src/data_exports/write_csv.py +++ /dev/null @@ -1,47 +0,0 @@ -import os -import glob -from pathlib import Path -from pyspark import sql as pyspark - -def save_spark_dataframe_as_csv( - df_input : pyspark.DataFrame, - output_folder : str -) -> None: - """ - Function to save a spark dataframe as a csv to a new folder in the data_out folder - - Parameters - ---------- - df_input : pyspark.DataFrame - The spark dataframe that you want to save as csv - output_folder : str - The name for the folder in which the csv file will be saved - """ - - (df_input - .coalesce(1) - .write - .mode('overwrite') - .option("header", True) - .csv(str(Path(f"data_out/{output_folder}"))) - ) - - -def rename_csv_output( - output_name : str -) -> None: - """ - By default spark gives files saved to csv random filenames. - This function will check for any CSV files in the specified subdirectory of data_out - and rename them to the same name as that subdirectory - - Parameters - ---------- - output_name : str - The name you want to give to the CSV output. This should be the - same name as the folder it is contained in. 
- """ - path = rf'data_out/{output_name}/*.csv' - files = glob.glob(path) - print(files) - os.rename(files[0], str(Path(f'data_out/{output_name}/{output_name}.csv')) ) diff --git a/src/data_ingestion/__init__.py b/src/data_ingestion/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/data_ingestion/get_data.py b/src/data_ingestion/get_data.py deleted file mode 100644 index 3200566..0000000 --- a/src/data_ingestion/get_data.py +++ /dev/null @@ -1,49 +0,0 @@ -"""Contains functions used to aquire the data from external sources""" - -import zipfile -import shutil -import os -import io -from pathlib import Path -import requests - - -def download_zip_from_url( - zip_file_url : str, - overwrite : bool = False, - output_path : str = None -) -> str: - """Downloads a zipfile from the specified URL - - Parameters - ---------- - zip_file_url : str - The url string of where the zipfile is held - overwrite : bool - if True, then running this again will overwrite existing files of the same name, otherwise - it will not. - output_path : str - Where you want the zip to be saved to - if left as "None" then it will be saved to - "data/{filename}" - - Returns - ---------- - output_path : str - - """ - filename = Path(zip_file_url).name - if output_path is None: - output_path = Path(f"data_in/{filename}") - else: - output_path = Path(f"{output_path}/{filename}") - if output_path.exists(): - if overwrite: - shutil.rmtree(output_path, ignore_errors=False, onerror=None) - else: - raise Exception(f"The zipfile already exists at: {output_path}") - - response = requests.get(zip_file_url, stream=True,timeout=3600) - downloaded_zip = zipfile.ZipFile(io.BytesIO(response.content)) - downloaded_zip.extractall(output_path) - - return str(output_path) diff --git a/src/data_ingestion/preprocessing.py b/src/data_ingestion/preprocessing.py deleted file mode 100644 index 547fcb3..0000000 --- a/src/data_ingestion/preprocessing.py +++ /dev/null @@ -1,3 +0,0 @@ -""" -Script which handles the pre-processing part of the pipeline. -""" diff --git a/src/data_ingestion/reading_data.py b/src/data_ingestion/reading_data.py deleted file mode 100644 index 2d3b20b..0000000 --- a/src/data_ingestion/reading_data.py +++ /dev/null @@ -1,27 +0,0 @@ -from pyspark import sql as pyspark -from pathlib import Path - -def load_csv_into_spark_data_frame( - spark : pyspark.SparkSession, - path_to_csv : str -) -> pyspark.DataFrame: - """ - loads the data from a CSV at a specified path into a spark dataframe - - Parameters - ---------- - spark : pyspark.SparkSession - The SparkSession for the spark app - path_to_csv : str - The path to the csv you want to load into a spark df as a string - - Returns - ------- - pyspark.DataFrame : - A spark dataframe containing the data that was in the CSV - """ - df_from_csv = (spark - .read - .csv(path_to_csv, header=True) - ) - return df_from_csv diff --git a/src/data_ingestion/validation_checks.py b/src/data_ingestion/validation_checks.py deleted file mode 100644 index 31e17c3..0000000 --- a/src/data_ingestion/validation_checks.py +++ /dev/null @@ -1,3 +0,0 @@ -""" -Script that performs basic validations checks on your imported data. 
-""" diff --git a/src/processing/__init__.py b/src/processing/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/processing/aggregate_counts.py b/src/processing/aggregate_counts.py deleted file mode 100644 index 613c539..0000000 --- a/src/processing/aggregate_counts.py +++ /dev/null @@ -1,73 +0,0 @@ - -from pyspark.sql import functions as F -from pyspark import sql as pyspark - -def get_distinct_count( - df_unaggregated : pyspark.DataFrame, - counting_col : str, - alias_name : str = "distinct_count" -) -> pyspark.DataFrame: - """ - Takes a spark dataframe and column, and returns the distinct count - of that column - - Parameters - ---------- - df_unaggregated : pyspark.DataFrame - The spark dataframe containing the column you want to count - counting_col : str - The column you want to get the counts from - alias_name : - The name for the aggregated count column - defaults to "distinct_count" if no alias_name is passed - - Returns - ------- - pyspark.DataFrame : - A spark datafram with one column (with the alias you specified) - and one row (the distinct count of values in that column) - """ - df_aggregated = (df_unaggregated - .agg(F.countDistinct(counting_col).alias(alias_name)) - ) - - return df_aggregated - - -def get_grouped_distinct_counts( - df_unaggregated : pyspark.DataFrame, - grouping_col : str, - counting_col : str, - alias_name : str = "distinct_count" -) -> pyspark.DataFrame: - """ - Takes a spark dataframe and column, groups by a specified column, and - returns the distinct count of values in another column - - Parameters - ---------- - df_unaggregated : pyspark.DataFrame - The spark dataframe containing the column you want to count - grouping_col : str - The column you want to group by - counting_col : str - The column you want to get the counts from - alias_name : - The name for the aggregated count column - defaults to "distinct_count" if no alias_name is passed - - Returns - ------- - pyspark.DataFrame : - A spark dataframe with two columns, the column you're grouping by and the distinct count (given - the alias you specify in alias_name) - and one row for each group - """ - - df_aggregated = (df_unaggregated - .groupBy(grouping_col) - .agg(F.countDistinct(counting_col).alias(alias_name)) - .orderBy(grouping_col) - ) - - return df_aggregated \ No newline at end of file diff --git a/src/utils/__init__.py b/src/utils/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/utils/file_paths.py b/src/utils/file_paths.py deleted file mode 100644 index a12fe7e..0000000 --- a/src/utils/file_paths.py +++ /dev/null @@ -1,34 +0,0 @@ -""" -Purpose of the script: loads config -""" -import logging -import toml -import pathlib - -logger = logging.getLogger(__name__) - -def get_config( - toml_path : str="config.toml" -) -> dict: - """Gets the config toml from the root directory and returns it as a dict. Can be called from any file in the project - - Parameters - ---------- - toml_path : str - Path, filename, and extension of the toml config file. - Defaults to config.toml - - Returns - ------- - Dict : - A dictionary containing details of the database, paths, etc. 
Should contain all the things that will - change from one run to the next - - Example - ------- - from shmi_improvement.utilities.helpers import get_config - config = get_config() - """ - return toml.load(pathlib.Path(toml_path)) - - \ No newline at end of file diff --git a/src/utils/logging_config.py b/src/utils/logging_config.py deleted file mode 100644 index 411e31d..0000000 --- a/src/utils/logging_config.py +++ /dev/null @@ -1,32 +0,0 @@ -""" -Purpose of the script: configures logging -""" -import sys -import time -import logging -from pathlib import Path - -logger = logging.getLogger(__name__) - -def configure_logging( - log_folder : str -) -> None: - """Set up logging format and location to store logs - - Please store logs in a secure location (e.g. IC Green) and not on your local machine as they may contain traces of data. - - Parameters - ---------- - log_folder : str - directory to store logs - """ - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s -- %(filename)s:\ - %(funcName)5s():%(lineno)s -- %(message)s', - handlers=[ - logging.FileHandler(str(Path(f".{log_folder}/{time.strftime('%Y-%m-%d_%H-%M-%S')}.log"))), - logging.StreamHandler(sys.stdout) # Add second handler to print log message to screen - ] - ) - logger = logging.getLogger(__name__) diff --git a/src/utils/spark.py b/src/utils/spark.py deleted file mode 100644 index 496f0b7..0000000 --- a/src/utils/spark.py +++ /dev/null @@ -1,26 +0,0 @@ -from pyspark import sql as pyspark - -def create_spark_session( - app_name : str = "spark_pipeline" -) -> pyspark.SparkSession: - """ - Creates a spark session: this is needed to run PySpark code. - - Parameters - ---------- - app_name : str - the name of the Spark application - Defaults to "spark_pipeline" - - Returns - ------- - pyspark.SparkSession - the SparkSession object - """ - spark_session = (pyspark.SparkSession - .builder - .appName(app_name) - .getOrCreate() - ) - - return spark_session \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/backtests/README.md b/tests/backtests/README.md deleted file mode 100644 index ff2d026..0000000 --- a/tests/backtests/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# Backtesting -Now that you are writing code in a reproducible manner, and perhaps using Python instead of another language, it is important that the code still produces the same results as the old code. Mistakes can easily be made in translating from one code base to another. - -By following the steps in this [guide](https://nhsdigital.github.io/rap-community-of-practice/training_resources/python/backtesting/), we can create a set of tests which will check that the outputs of the new code match the outputs of the old code. 
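For example, a minimal sketch of the comparison such a backtest performs, assuming pandas and the output/ground-truth locations configured in backtesting_params.py below (the literal paths here are illustrative, mirroring those entries):

import pandas as pd

# Compare one freshly produced output against its stored ground truth.
# These literal paths mirror the entries in backtesting_params.py below.
df_new = pd.read_csv("data_out/df_hes_england_count/df_hes_england_count.csv")
df_ground_truth = pd.read_csv("tests/backtests/ground_truth/hes_england_count_expected_output.csv")

# Raises an AssertionError describing the first mismatch if the two frames differ.
pd.testing.assert_frame_equal(df_ground_truth, df_new, check_dtype=True)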
diff --git a/tests/backtests/__init__.py b/tests/backtests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/backtests/backtesting_params.py b/tests/backtests/backtesting_params.py deleted file mode 100644 index 349b15e..0000000 --- a/tests/backtests/backtesting_params.py +++ /dev/null @@ -1,13 +0,0 @@ -import pathlib - -bt_params = { - 'output_base_path': pathlib.Path('./data_out/'), - 'ground_truth_base_path': pathlib.Path('./tests/backtests/ground_truth/'), - - 'files_to_compare': [ - { - 'new_output': 'df_hes_england_count/df_hes_england_count.csv', - 'ground_truth': 'hes_england_count_expected_output.csv', - }, - ] -} diff --git a/tests/backtests/ground_truth/hes_england_count_expected_output.csv b/tests/backtests/ground_truth/hes_england_count_expected_output.csv deleted file mode 100644 index d628546..0000000 --- a/tests/backtests/ground_truth/hes_england_count_expected_output.csv +++ /dev/null @@ -1,2 +0,0 @@ -number_of_episodes -10000 diff --git a/tests/backtests/test_compare_outputs.py b/tests/backtests/test_compare_outputs.py deleted file mode 100644 index 18f9bb1..0000000 --- a/tests/backtests/test_compare_outputs.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -This script checks whether pairs of CSVs are the same as each other. - -To use: - files_to_compare: [(String, String)] is imported from params.py. It contains pairs of filenames to be tested. - OUTPUT_DIR: String and GROUND_TRUTH_DIR: String are also imported from params.py. They are the respective locations of the pair of files. - -""" - -import create_publication -import pandas as pd -import pathlib -from .backtesting_params import bt_params - -def test_backtests(): - - for backtest in bt_params['files_to_compare']: - - new_output_file = backtest['new_output'] - ground_truth_file = backtest['ground_truth'] - - if not pathlib.Path(ground_truth_file).is_file(): - create_publication.main() - - df_output = pd.read_csv(bt_params['output_base_path'] / backtest['new_output']) - df_ground_truth = pd.read_csv(bt_params['ground_truth_base_path'] / backtest['ground_truth']) - - print(f"\n Testing file: {ground_truth_file} against {new_output_file}") - - pd.testing.assert_frame_equal(df_ground_truth, df_output, check_dtype=True) diff --git a/tests/unittests/README.md b/tests/unittests/README.md deleted file mode 100644 index fc47a8a..0000000 --- a/tests/unittests/README.md +++ /dev/null @@ -1,5 +0,0 @@ -Tests are functions which make logical assertions. If all assertions are correct then the test passes, if at least one assertion is incorrect then the test fails. Tests are a useful metric for deciding if an application has met its requirements. - -Unit tests test a single piece of functionality, this functionality is delivered by a single unit of code such as a method. The philosophy behind unit tests is that if the functionality of the smallest units of the program can be guaranteed, then it is significantly more likely that the project as a whole is succeeding in delivering its functionality. - -For more information on Unit testing see [our guide](https://nhsdigital.github.io/rap-community-of-practice/training_resources/python/unit-testing/). 
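For example, a minimal pytest-style sketch in this spirit, reusing the pandas get_distinct_count helper shown earlier in the notebook's history (the example data are invented for the sketch):

import pandas as pd

def get_distinct_count(df: pd.DataFrame, col_to_aggregate: str) -> int:
    # Same logic as the notebook helper: count distinct values in one column.
    return df[col_to_aggregate].nunique()

def test_get_distinct_count():
    # Three rows but only two distinct EPIKEY values, so the helper should return 2.
    df = pd.DataFrame({"EPIKEY": [1, 1, 2]})
    assert get_distinct_count(df, "EPIKEY") == 2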
diff --git a/tests/unittests/__init__.py b/tests/unittests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/unittests/test_aggregate_counts.py b/tests/unittests/test_aggregate_counts.py deleted file mode 100644 index 0865c58..0000000 --- a/tests/unittests/test_aggregate_counts.py +++ /dev/null @@ -1,63 +0,0 @@ -import pytest -import pandas - -from src.processing import aggregate_counts as aggregate_counts -from src.utils import spark as spark_utils -from pyspark.sql import functions as F -from pyspark.sql import SparkSession - -def test_distinct_count(): - """ - Tests get_distinct_counts - """ - spark = spark_utils.create_spark_session('tests') - - expected_data = [ - (3,), - ] - expected_cols = ['count'] - df_expected = spark.createDataFrame(expected_data, expected_cols) - - unaggregated_data = [ - ('group_1',), - ('group_2',), - ('group_2',), - ('group_3',), - ('group_3',), - ('group_3',), - ] - unaggregated_cols = ['group_name'] - df_unaggregated = spark.createDataFrame(unaggregated_data, unaggregated_cols) - - df_actual = aggregate_counts.get_distinct_count(df_unaggregated, 'group_name', 'count') - - assert df_actual.toPandas().equals(df_expected.toPandas()) - - -def test_get_grouped_distinct_counts(): - """ - Tests get_distinct_counts - """ - spark = spark_utils.create_spark_session('tests') - - expected_data = [ - ('group_1', 1), - ('group_2', 1), - ('group_3', 2), - ] - expected_cols = ['group_name', 'count'] - df_expected = spark.createDataFrame(expected_data, expected_cols) - - unaggregated_data = [ - ('group_1', '1'), - ('group_2', '1'), - ('group_2', '1'), - ('group_3', '1'), - ('group_3', '2'), - ] - unaggregated_cols = ['group_name', 'values'] - df_unaggregated = spark.createDataFrame(unaggregated_data, unaggregated_cols) - - df_actual = aggregate_counts.get_grouped_distinct_counts(df_unaggregated, 'group_name', 'values', 'count') - - assert df_actual.toPandas().equals(df_expected.toPandas()) diff --git a/tests/unittests/test_spark.py b/tests/unittests/test_spark.py deleted file mode 100644 index 8e0757d..0000000 --- a/tests/unittests/test_spark.py +++ /dev/null @@ -1,15 +0,0 @@ -import pytest -import pandas - -from src.utils import spark as spark_utils -from pyspark.sql import functions as F -from pyspark.sql import SparkSession - -def test_create_spark_session(): - - test_app_name = 'tests' - spark = spark_utils.create_spark_session(test_app_name) - - assert spark.__class__.__name__ == 'SparkSession' - assert spark.sparkContext.appName == test_app_name - \ No newline at end of file From 12d20ab61a72306b652a36240610bb4164c4f6d4 Mon Sep 17 00:00:00 2001 From: jenniferstruthers1-nhs Date: Fri, 12 Jul 2024 10:05:34 +0000 Subject: [PATCH 6/8] updated simple notebook example --- rap_example_pipeline_python.ipynb | 251 ++++++++++++++++++------------ 1 file changed, 149 insertions(+), 102 deletions(-) diff --git a/rap_example_pipeline_python.ipynb b/rap_example_pipeline_python.ipynb index a3f17a9..61d320e 100644 --- a/rap_example_pipeline_python.ipynb +++ b/rap_example_pipeline_python.ipynb @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 258, "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 259, "metadata": {}, "outputs": [], "source": [ @@ -98,16 +98,18 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 260, "metadata": {}, "outputs": [], "source": [ "zip_file_url = 
\"https://files.digital.nhs.uk/assets/Services/Artificial%20data/Artificial%20HES%20final/artificial_hes_ae_202302_v1_sample.zip\"\n", "path_to_downloaded_data = \"data_in/artificial_hes_ae_202302_v1_sample.zip/artificial_hes_ae_202302_v1_sample/artificial_hes_ae_2122.csv\"\n", "\n", - "# You can update these\n", - "col_to_aggregate = \"LSOA11\"\n", - "col_to_count = \"EPIKEY\"\n" + "# The column(s) we are going to investigate\n", + "cols_to_group = [\"AEARRIVALMODE\"]\n", + "\n", + "# where to save our graphs\n", + "output_folder = \"data_out\"\n" ] }, { @@ -123,12 +125,8 @@ "source": [ "First we will load the data: we're going to use an artificial fake version of the NHS Hospital Episode Statistics Accident and Emergency (HES AE) data from 2003. \n", "\n", - "We've hidden all the complexity of aquiring the data away in a function - called \"get_data\". This is good practice, because:\n", - "\n", - "1. this data might be used many times in many different pipelines - this function can be reused, saving your colleagues time\n", - "2. the way the data is acquired might change, e.g. in different platforms, to accomodate this we only need to add to, change or improve this function - your downstream pipeline should continue as normal\n", "\n", - "This function:\n", + "This code:\n", "- gets the location of the data from the config file\n", "- downloads the CSV\n", "- loads that CSV into a pandas dataframe in memory\n", @@ -138,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 261, "metadata": {}, "outputs": [], "source": [ @@ -153,62 +151,17 @@ "\n", "response = requests.get(zip_file_url, stream=True,timeout=3600)\n", "downloaded_zip = zipfile.ZipFile(io.BytesIO(response.content))\n", - "downloaded_zip.extractall(output_path)\n", - "\n" + "downloaded_zip.extractall(output_path)\n" ] }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 262, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['FYEAR', 'PARTYEAR', 'PSEUDO_HESID', 'AEKEY', 'AEKEY_FLAG', 'AEARRIVALMODE', 'AEATTEND_EXC_PLANNED', 'AEATTENDCAT', 'AEATTENDDISP', 'AEDEPTTYPE', 'AEINCLOCTYPE', 'AEPATGROUP', 'AEREFSOURCE', 'AT_GP_PRACTICE', 'AT_RESIDENCE', 'AT_TREATMENT', 'ARRIVALAGE', 'ARRIVALAGE_CALC', 'ARRIVALDATE', 'ARRIVALTIME', 'CANNET', 'CANREG', 'CCG_GP_PRACTICE', 'CCG_RESIDENCE', 'CCG_RESPONSIBILITY', 'CCG_RESPONSIBILITY_ORIGIN', 'CCG_TREATMENT', 'CCG_TREATMENT_ORIGIN', 'CR_GP_PRACTICE', 'CR_RESIDENCE', 'CR_TREATMENT', 'CONCLDUR', 'CONCLTIME', 'DEPDUR', 'DEPTIME', 'DIAG2_01', 'DIAG2_02', 'DIAG2_03', 'DIAG2_04', 'DIAG2_05', 'DIAG2_06', 'DIAG2_07', 'DIAG2_08', 'DIAG2_09', 'DIAG2_10', 'DIAG2_11', 'DIAG2_12', 'DIAG3_01', 'DIAG3_02', 'DIAG3_03', 'DIAG3_04', 'DIAG3_05', 'DIAG3_06', 'DIAG3_07', 'DIAG3_08', 'DIAG3_09', 'DIAG3_10', 'DIAG3_11', 'DIAG3_12', 'DIAGA_01', 'DIAGA_02', 'DIAGA_03', 'DIAGA_04', 'DIAGA_05', 'DIAGA_06', 'DIAGA_07', 'DIAGA_08', 'DIAGA_09', 'DIAGA_10', 'DIAGA_11', 'DIAGA_12', 'DIAGS_01', 'DIAGS_02', 'DIAGS_03', 'DIAGS_04', 'DIAGS_05', 'DIAGS_06', 'DIAGS_07', 'DIAGS_08', 'DIAGS_09', 'DIAGS_10', 'DIAGS_11', 'DIAGS_12', 'DIAGSCHEME', 'ETHNOS', 'EPIKEY', 'GORTREAT', 'GPPRAC', 'INITDUR', 'INITTIME', 'INVEST2_01', 'INVEST2_02', 'INVEST2_03', 'INVEST2_04', 'INVEST2_05', 'INVEST2_06', 'INVEST2_07', 'INVEST2_08', 'INVEST2_09', 'INVEST2_10', 'INVEST2_11', 'INVEST2_12', 'INVESTSCHEME', 'PCON', 'PCON_ONS', 'PCTCODE_HIS', 'PCTORIG_HIS', 'PCTTREAT', 'PROCODE3', 'PROCODE5', 'PROCODET', 'PROCSCHEME', 'PURCODE', 'RANK_ORDER', 'RESCTY', 
'RESCTY_ONS', 'RESGOR', 'RESGOR_ONS', 'RESLADST', 'RESLADST_ONS', 'RESPCT_HIS', 'RESSTHA_HIS', 'SEX', 'LSOA01', 'MSOA01', 'STHATRET', 'SUSHRG', 'SUSLDDATE_HIS', 'TREAT2_01', 'TREAT2_02', 'TREAT2_03', 'TREAT2_04', 'TREAT2_05', 'TREAT2_06', 'TREAT2_07', 'TREAT2_08', 'TREAT2_09', 'TREAT2_10', 'TREAT2_11', 'TREAT2_12', 'TREAT3_01', 'TREAT3_02', 'TREAT3_03', 'TREAT3_04', 'TREAT3_05', 'TREAT3_06', 'TREAT3_07', 'TREAT3_08', 'TREAT3_09', 'TREAT3_10', 'TREAT3_11', 'TREAT3_12', 'TREATSCHEME', 'TRETDUR', 'TRETTIME', 'LSOA11', 'MSOA11', 'PROVDIST', 'PROVDIST_FLAG', 'NER_GP_PRACTICE', 'NER_RESIDENCE', 'NER_TREATMENT', 'SITETRET', 'SITEDIST', 'SITEDIST_FLAG']\n", - " SEX\n", - "LSOA11 \n", - "E01000009 1\n", - "E01000012 1\n", - "E01000020 1\n", - "E01000034 1\n", - "E01000052 1\n", - "... ...\n", - "W01000488 1\n", - "W01000861 1\n", - "W01001587 1\n", - "W01001685 1\n", - "W01001928 1\n", - "\n", - "[8304 rows x 1 columns]\n" - ] - }, - { - "ename": "TypeError", - "evalue": "ZMQDisplayPublisher.publish() got an unexpected keyword argument 'truncate'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[73], line 8\u001b[0m\n\u001b[1;32m 6\u001b[0m df \u001b[38;5;241m=\u001b[39m df\u001b[38;5;241m.\u001b[39mgroupby([\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLSOA11\u001b[39m\u001b[38;5;124m\"\u001b[39m])\u001b[38;5;241m.\u001b[39mnunique()\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28mprint\u001b[39m(df)\n\u001b[0;32m----> 8\u001b[0m \u001b[43mdisplay\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtruncate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/IPython/core/display_functions.py:305\u001b[0m, in \u001b[0;36mdisplay\u001b[0;34m(include, exclude, metadata, transient, display_id, raw, clear, *objs, **kwargs)\u001b[0m\n\u001b[1;32m 302\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m metadata:\n\u001b[1;32m 303\u001b[0m \u001b[38;5;66;03m# kwarg-specified metadata gets precedence\u001b[39;00m\n\u001b[1;32m 304\u001b[0m _merge(md_dict, metadata)\n\u001b[0;32m--> 305\u001b[0m \u001b[43mpublish_display_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mformat_dict\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmd_dict\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 306\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m display_id:\n\u001b[1;32m 307\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DisplayHandle(display_id)\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/IPython/core/display_functions.py:93\u001b[0m, in \u001b[0;36mpublish_display_data\u001b[0;34m(data, metadata, source, transient, **kwargs)\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m transient:\n\u001b[1;32m 91\u001b[0m kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtransient\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m transient\n\u001b[0;32m---> 93\u001b[0m \u001b[43mdisplay_pub\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpublish\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 
94\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 95\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 96\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m 97\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[0;31mTypeError\u001b[0m: ZMQDisplayPublisher.publish() got an unexpected keyword argument 'truncate'" - ] - } - ], + "outputs": [], "source": [ - "\n", "# read the CSV file into a pandas dataframe\n", - "df_data = pd.read_csv(path_to_downloaded_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we will use our get data function to... get the data! Look how simple it makes the code below to read - it does what it says on the tin" + "df = pd.read_csv(path_to_downloaded_data)" ] }, { @@ -220,7 +173,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 263, "metadata": {}, "outputs": [ { @@ -432,13 +385,13 @@ "[5 rows x 165 columns]" ] }, - "execution_count": 74, + "execution_count": 263, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_data.head(5)" + "df.head(5)" ] }, { @@ -454,69 +407,163 @@ "source": [ "Now the fun part - we get to do some interesting processing on the data.\n", "\n", - "The simplest piece of processing you might do is simply get a distinct count on one of the columns. \n", - "\n", - "Again, we create a function to do this - for a very small bit of processing like this it might not make a lot of sense, but if you were doing a larger derivation that might feasibly be used in other work, it could really save someone else some time!" + "Let's group the dataframe by the specified columns in cols_to_group, counts the number of rows in each group, and create a new column \"Count\" to show this." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 264, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " AEARRIVALMODE Count\n", + "0 1 2247\n", + "1 2 7362\n", + "2 9 391\n" + ] + } + ], + "source": [ + "df = df.groupby(cols_to_group).size().reset_index(name=\"Count\")\n", + "print(df)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's run our simple analysis and print the result:" + "It's a little confusing what those numbers mean, so let's put them into plain English" ] }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 265, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['FYEAR', 'PARTYEAR', 'PSEUDO_HESID', 'AEKEY', 'AEKEY_FLAG', 'AEARRIVALMODE', 'AEATTEND_EXC_PLANNED', 'AEATTENDCAT', 'AEATTENDDISP', 'AEDEPTTYPE', 'AEINCLOCTYPE', 'AEPATGROUP', 'AEREFSOURCE', 'AT_GP_PRACTICE', 'AT_RESIDENCE', 'AT_TREATMENT', 'ARRIVALAGE', 'ARRIVALAGE_CALC', 'ARRIVALDATE', 'ARRIVALTIME', 'CANNET', 'CANREG', 'CCG_GP_PRACTICE', 'CCG_RESIDENCE', 'CCG_RESPONSIBILITY', 'CCG_RESPONSIBILITY_ORIGIN', 'CCG_TREATMENT', 'CCG_TREATMENT_ORIGIN', 'CR_GP_PRACTICE', 'CR_RESIDENCE', 'CR_TREATMENT', 'CONCLDUR', 'CONCLTIME', 'DEPDUR', 'DEPTIME', 'DIAG2_01', 'DIAG2_02', 'DIAG2_03', 'DIAG2_04', 'DIAG2_05', 'DIAG2_06', 'DIAG2_07', 'DIAG2_08', 'DIAG2_09', 'DIAG2_10', 'DIAG2_11', 'DIAG2_12', 'DIAG3_01', 'DIAG3_02', 'DIAG3_03', 'DIAG3_04', 'DIAG3_05', 'DIAG3_06', 'DIAG3_07', 'DIAG3_08', 'DIAG3_09', 'DIAG3_10', 'DIAG3_11', 'DIAG3_12', 'DIAGA_01', 'DIAGA_02', 'DIAGA_03', 'DIAGA_04', 'DIAGA_05', 'DIAGA_06', 'DIAGA_07', 'DIAGA_08', 'DIAGA_09', 'DIAGA_10', 'DIAGA_11', 'DIAGA_12', 'DIAGS_01', 'DIAGS_02', 'DIAGS_03', 'DIAGS_04', 'DIAGS_05', 'DIAGS_06', 'DIAGS_07', 'DIAGS_08', 'DIAGS_09', 'DIAGS_10', 'DIAGS_11', 'DIAGS_12', 'DIAGSCHEME', 'ETHNOS', 'EPIKEY', 'GORTREAT', 'GPPRAC', 'INITDUR', 'INITTIME', 'INVEST2_01', 'INVEST2_02', 'INVEST2_03', 'INVEST2_04', 'INVEST2_05', 'INVEST2_06', 'INVEST2_07', 'INVEST2_08', 'INVEST2_09', 'INVEST2_10', 'INVEST2_11', 'INVEST2_12', 'INVESTSCHEME', 'PCON', 'PCON_ONS', 'PCTCODE_HIS', 'PCTORIG_HIS', 'PCTTREAT', 'PROCODE3', 'PROCODE5', 'PROCODET', 'PROCSCHEME', 'PURCODE', 'RANK_ORDER', 'RESCTY', 'RESCTY_ONS', 'RESGOR', 'RESGOR_ONS', 'RESLADST', 'RESLADST_ONS', 'RESPCT_HIS', 'RESSTHA_HIS', 'SEX', 'LSOA01', 'MSOA01', 'STHATRET', 'SUSHRG', 'SUSLDDATE_HIS', 'TREAT2_01', 'TREAT2_02', 'TREAT2_03', 'TREAT2_04', 'TREAT2_05', 'TREAT2_06', 'TREAT2_07', 'TREAT2_08', 'TREAT2_09', 'TREAT2_10', 'TREAT2_11', 'TREAT2_12', 'TREAT3_01', 'TREAT3_02', 'TREAT3_03', 'TREAT3_04', 'TREAT3_05', 'TREAT3_06', 'TREAT3_07', 'TREAT3_08', 'TREAT3_09', 'TREAT3_10', 'TREAT3_11', 'TREAT3_12', 'TREATSCHEME', 'TRETDUR', 'TRETTIME', 'LSOA11', 'MSOA11', 'PROVDIST', 'PROVDIST_FLAG', 'NER_GP_PRACTICE', 'NER_RESIDENCE', 'NER_TREATMENT', 'SITETRET', 'SITEDIST', 'SITEDIST_FLAG']\n" + " AEARRIVALMODE Count\n", + "0 Ambulance 2247\n", + "1 Other 7362\n", + "2 Not known 391\n" ] - }, + } + ], + "source": [ + "replace_names = {\n", + " 1: \"Ambulance\",\n", + " 2: \"Other\",\n", + " 9: \"Not known\"\n", + "}\n", + "df[\"AEARRIVALMODE\"] = df[\"AEARRIVALMODE\"].map(replace_names)\n", + "print(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualising" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now it's time to take our dataframe and turn it into something nice to look at and easy to interpret!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 266, + "metadata": {}, + "outputs": [ { - "ename": "KeyError", - "evalue": "'LSOA11'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", - "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: 'LSOA11'", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[76], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m df_data \u001b[38;5;241m=\u001b[39m df_data[[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLSOA11\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSEX\u001b[39m\u001b[38;5;124m\"\u001b[39m]]\n\u001b[1;32m 3\u001b[0m df_data \u001b[38;5;241m=\u001b[39m df_data\u001b[38;5;241m.\u001b[39mgroupby([\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLSOA11\u001b[39m\u001b[38;5;124m\"\u001b[39m])\u001b[38;5;241m.\u001b[39mnunique()\n\u001b[0;32m----> 5\u001b[0m distinct_epikey_count \u001b[38;5;241m=\u001b[39m \u001b[43mdf_data\u001b[49m\u001b[43m[\u001b[49m\u001b[43mcol_to_aggregate\u001b[49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mnunique()\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDistinct EPIKEY count: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdistinct_epikey_count\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/core/frame.py:4102\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4102\u001b[0m indexer \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4104\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", - "\u001b[0;31mKeyError\u001b[0m: 'LSOA11'" - ] + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAkQAAAHHCAYAAABeLEexAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAABN/UlEQVR4nO3dfVzN9/8/8McpdSo5J6XLj1TTqFwOW3KVixTLNpO5JhPGJz7k42J9tjXsIrMZNldfQzElfIaNPpNEGLlqy0RyvUwXGHVkFPX6/eF33vPeKUKceD/ut9v7dtt5vZ7n9X69O0c99nq/z/uohBACRERERApmYuwJEBERERkbAxEREREpHgMRERERKR4DERERESkeAxEREREpHgMRERERKR4DERERESkeAxEREREpHgMRERERKR4DERE90Pnz56FSqfDFF18YeypERE8EAxHRE3DmzBm88847eOGFF2BhYQGNRoP27dtj/vz5uHnzprGnBwBYtGgRYmNjjT2NSi1atAgqlQq+vr6V1qhUqkq3MWPGVPicfv36QaVSYdq0aRX2p6amysYxNTWFg4MD+vbti6ysLIP64cOHy+rVajUaNWqEqKgo3Lp1q8I5jxs3DgDw5ZdfQqVSYfv27ZUe4zfffAOVSoUffvhB1v7KK69ApVJh8eLFFT4vNjYWKpUKhw8frnRsfdBVqVT4+OOPK6wZPHgwVCoVrK2tDfqEEPj222/RqVMn2NjYwMrKCs2aNcPMmTNx48YNg/rOnTtL+zMxMYFGo0Hjxo0xdOhQJCcnV7h/d3f3Sl/jHj16VHpsRA+rlrEnQPS8SUxMxFtvvQW1Wo1hw4ahadOmKC0txU8//YQpU6bg2LFjWLp0qbGniUWLFqFevXoYPny4sadSobi4OLi7u+PgwYM4ffo0PD09K6zr3r07hg0bZtDeqFEjgzadTofNmzfD3d0da9aswaxZs6BSqSoc91//+hdefvll3L59G7/++iuWLFmC1NRUZGZmwsnJSVarVquxbNkyAEBRURG+//57fPTRRzhz5gzi4uIqPcYBAwZgypQpiI+PR0BAQIU18fHxsLOzQ8+ePaW2U6dO4dChQ3B3d0dcXBzGjh1b6T6qwsLCAmvWrMH7778va79x4wa+//57WFhYGDynrKwMgwYNwrp169CxY0dMnz4dVlZW2LNnD2bMmIH169dj+/btcHR0lD2vfv36iI6OlsY/ffo0NmzYgNWrV6Nfv35YvXo1zMzMZM9p2bIl/v3vfxvMwcXF5bGOm0hGEFG1OXv2rLC2thZeXl4iNzfXoP/UqVNi3rx5RpiZoSZNmgh/f/8q1Z47d04AEJ9//vmTndT/d/bsWQFAbNiwQdjb24vp06dXWAdAhIeHV3ncFStWCDMzM7Fjxw4BQKSmphrU7Ny5UwAQ69evl7UvXrxYABCfffaZrD00NFTUrl1b1lZeXi7atm0rVCqVyM/Pv++cu3XrJrRarbh165bBXH7//XdhYmIixowZI2uPiooSDg4O4rvvvhMqlUqcO3fO4LkxMTECgDh06FDFPwzx1+vap08fAUBkZGTI+uPi4oSZmZl47bXXDI7x008/FQDE5MmTDcb94YcfhImJiejRo4es3d/fXzRp0sSg/s6dO+Kf//ynACCmTp0q63NzcxPBwcGVHgNRdeEpM6JqNHv2bBQXF2P58uVwdnY26Pf09MSECROkx3fu3MFHH32Ehg0bQq1Ww93dHf/5z39QUlIie55KpcL06dMNxnN3d5et8OhPk+zduxeTJk2Cvb09ateujTfffBOXL1+WPe/YsWPYtWuXdPqhc+fOVTrGuXPnws3NDZaWlvD390dmZqbUFxMTA5VKhV9++cXgeZ9++ilMTU1x8eLFB+4jLi4OdevWRXBwMPr27XvfVZaHERcXh+7du6NLly7w9vZ+qHE7duwI4O7p0AdRqVTo0KEDhBA4e/bsfWuHDBmCoqIiJCYmGvQlJCSgvLwcgwcPlrXHx8ejb9++6NWrF7RaLeLj46t8HBXx8/ODh4eHwThxcXHo0aMHbG1tZe03b97E559/jkaNGkmrPfd67bXXEBoaiq1bt2L//v0P3L+pqSm++uor+Pj4YMGCBSgqKnqs4yF6FAxERNVo8+bNeOGFF9CuXbsq1Y8cORJRUVFo1aoV5s6dC39/f0RHR2PAgAGPNY/x48fjyJEj+PDDDzF27Fhs3rxZum4FAObNm4f69evDy8sL3377Lb799lu89957Dxx31apV+OqrrxAeHo7IyEhkZmaia9euKCgoAAD07dsXlpaWFQaNuLg4dO7cGf/4xz8euJ+4uDj06dMH5ubmGDhwoHSKqCK3bt3ClStXDLbS0lJZXW5uLnbu3ImBAwcCAAYOHIj//ve/BnWVOX/+PACgbt261Vrfp08fWFhYVBhq4uPj4ebmhvbt20ttBw4cwOnTpzFw4ECYm5ujT58+1RIYBw4ciISEBAghAABXrlzBtm3bMGjQIIPan376CdeuXcOgQYNQq1bFV17oT2Nu2bKlSvs3NTXFwIED8eeff+Knn36S9d2+fbvC17imXI9HzwljL1ERPS+KiooEAPHGG29UqT4jI0MAECNHjpS1T548WQAQO3bskNoAiA8//NBgDDc3NxEaGio91p8mCQgIEOXl5VJ7RESEMDU1FYWFhVLbo5wys7S0FL///rvUfuDAAQFARERESG0DBw4ULi4uoqysTGr7+eefBQARExPzwH0dPnxYABDJyclCiLunn+rXry8mTJhgUAug0m3NmjWy2i+++EJYWloKnU4nhBDi5MmTAoDYuHGjrE5/ymzFihXi8uXLIjc3V2zdulV4enoKlUolDh48KKvXnzK7fPmyuHz5sjh9+rT44osvhEqlEk2bNpW9Dvo5//0031tvvSUsLCxEUVGR1HbixAkBQERGRspqx40bJ1xdXaVxt23bJgCIX375RVb3MKfMPv/8c5GZmSkAiD179gghhFi4cKGwtrYWN27cMDgtOG/evAp/dve6evWqdDpOr7JTZnobN24UAMT8+fOlNjc3t0pf4+jo6ErHInpYXCEiqiY6nQ4AUKdOnSrV/+9//wMATJo0Sdauv3i0olMoVTV69GjZxcIdO3ZEWVkZfvvtt0ceEwB69+4tW+F55ZVX4OvrKx0LcHdlQL8aoxcXFwdLS0uEhIQ8cB9xcXFwdHREly5dANw9/dS/f38kJCSgrKzMoP6NN95AcnKywaZ//r3jBgcHS6/Piy++iNatW1e6ujJixAjY29vDxcUFPXr0QFFREb799lu8/PLLBrU3btyAvb097O3t4enpicmTJ6N9+/b4/vvvK71o+15DhgzBrVu3sGHDBqlNv2J07+myO3fuYO3atejfv780bteuXeHg4PDYq0RNmjRB8+bNsWbNGmn/b7zxBqysrAxqr1+/DuD+73V9n/7fRVXoP8mmH1/P19e3wtdYv9pHVB34KTOiaqLRaAAY/jKvzG+//QYTExODT085OTnBxsbmscJLgwYNZI/1p22uXbv2yGMCd0PE3zVq1Ajr1q2THn
fv3h3Ozs6Ii4tDt27dUF5ejjVr1uCNN954YFgsKytDQkICunTpgnPnzkntvr6+mDNnDlJSUhAYGCh7Tv369Sv9hJZeVlYWfvnlFwwbNgynT5+W2jt37oyFCxdCp9NJr59eVFQUOnbsiOLiYmzcuBEJCQkwMan4/yEtLCywefNmAMDvv/+O2bNn49KlS7C0tLzvvPR69uwJW1tbxMfHS9eErVmzBi1atECTJk2kum3btuHy5ct45ZVXZMfRpUsXrFmzBp999lmlc6yKQYMGYc6cOYiIiMC+ffvwn//8p8I6/et4v/d6VULT3xUXF1f4nHr16j3wNSZ6XAxERNVEo9HAxcVFdpFxVVRlBaEyFa2YAHevx6iI+P/XhzxJpqamGDRoEL755hssWrQIe/fuRW5uLoYMGfLA5+7YsQN5eXlISEhAQkKCQX9cXJxBIKqK1atXAwAiIiIQERFh0P/dd9/h7bfflrU1a9ZM+iPcu3dv/Pnnnxg1ahQ6dOgAV1dXWa2pqansD3ZQUBC8vLzwzjvvGNw/qCJmZmbo168fvvnmGxQUFCAnJwenTp3C7NmzZXX6VaB+/fpVOM6uXbsMVsYexsCBAxEZGYlRo0bBzs6u0p+1t7c3AODXX39F7969K6z59ddfAQA+Pj5V3r/+305lt1ggepJ4yoyoGvXq1QtnzpxBWlraA2vd3NxQXl6OU6dOydoLCgpQWFgINzc3qa1u3booLCyU1ZWWliIvL++R5/ooQezvcwWAkydPwt3dXdY2bNgw6Z4/cXFxsLe3R1BQ0APHj4uLg4ODA9avX2+wDRw4EBs3bnzoC2mFEIiPj0eXLl0qHLd58+ZVOt00a9Ys3Lp1C5988skDa52dnREREYHNmzdX6VNWwN1TY2VlZVi7di3i4+OhUqlkp4T09wTq379/hcehX5V7HA0aNED79u2RmpqKt956q9ILpjt06AAbGxvEx8dXGspXrVoF4O6/iaooKytDfHw8rKys0KFDh0c7AKLHYeyLmIieJ6dPnxa1a9cWPj4+Bvef0ffr70Okv6h69OjRspqpU6caXFTdpk0b8dJLL8nqvv76awGgwouq/34hrf5C4Z07d0ptvr6+okWLFlU6rgddVD1x4kSD5zRv3lwEBgYKjUYjxo8f/8B9/Pnnn6JOnTpixIgRFfbv3btXABAJCQlSG6pwH6I9e/YIAGLVqlUV9n/yySfCxMREXLx4UQhR+X2IhBCiX79+Qq1Wi7y8PKmtovsQCSHElStXhJWVlcFF9pXNuby8XLi7u4uXX35ZODs7i86dO8v6v/32WwFA7N69u8LjGDVqlLCxsZHuZ/SwF1Xrpaamig8//FAcP378vsf48ccfCwBi2rRpBuNu2bJFmJiYiKCgIFl7Ve5D9O6778r6eB8ielp4yoyoGjVs2BDx8fHo378/vL29ZXeq3rdvH9avXy9dI9KiRQuEhoZi6dKlKCwshL+/Pw4ePIiVK1eid+/eslMfI0eOxJgxYxASEoLu3bvjyJEjSEpKQr169R55rq1bt8bixYvx8ccfw9PTEw4ODujatet9n+Pp6YkOHTpg7NixKCkpwbx582BnZ4epU6ca1A4bNgyTJ08GgCqdLvvhhx9w/fp1vP766xX2t23bFvb29oiLi0P//v2l9pMnT0qnxO7l6OiI7t27Iy4uDqampggODq5w3Ndffx3vvfceEhISDC5w/7spU6Zg3bp1mDdvHmbNmnXfWjs7O7z99ttYtGgRsrKypNNMlVGpVBg0aBA+/fRTAMDMmTNl/XFxcbCzs6v0lg6vv/46vvnmGyQmJqJPnz5S+4oVK7B161aD+nvvh3Uvf39/+Pv733euAPDuu+/il19+wWeffYa0tDSEhITA0tISP/30E1avXg1vb2+sXLnS4HlFRUXS6/Xnn39Kd6o+c+YMBgwYgI8++sjgORcvXqzwNba2tq70lB3RQzN2IiN6Hp08eVKMGjVKuLu7C3Nzc1GnTh3Rvn178fXXX8vuSHz79m0xY8YM4eHhIczMzISrq6uIjIw0uGtxWVmZmDZtmqhXr56wsrISQUFB4vTp05V+7L4qK0T5+fkiODhY1KlTRwC470fw711JmDNnjnB1dRVqtVp07NhRHDlypMLn5OXlCVNTU9GoUaMq/cxee+01YWFhIW7cuFFpzfDhw4WZmZm4cuWKEOL+H7v39/cXpaWlws7OTnTs2PG++/bw8JBW4O63QiSEEJ07dxYajUa6hUFlK0RCCHHmzBlhamoqe41wn1WtY8eOCQBCrVaLa9euSe0FBQWiVq1aYujQoZUew59//imsrKzEm2++KYT4671Q2XbhwoUq34G8smMsKysTMTExon379kKj0QgLCwvRpEkTMWPGDFFcXGxQ7+/vL5uDtbW1ePHFF8WQIUPEtm3bKtz3/T527+bmdt95Ez0MlRBP4SpLIlKcK1euwNnZGVFRUfjggw+MPR0iovviRdVE9ETExsairKwMQ4cONfZUiIgeiNcQEVG12rFjB44fP45PPvkEvXv3NvgEGhFRTcRTZkRUrTp37ox9+/ahffv2WL16dZW+u4yIyNgYiIiIiEjxeA0RERERKR4DERERESkeL6qugvLycuTm5qJOnTqP9b1TRERE9PQIIXD9+nW4uLg88IuPGYiqIDc31+DLHImIiOjZcOHCBdSvX/++NQxEVVCnTh0Ad3+gGo3GyLMhIiKiqtDpdHB1dZX+jt8PA1EV6E+TaTQaBiIiIqJnTFUud+FF1URERKR4DERERESkeAxEREREpHgMRERERKR4DERERESkeAxEREREpHgMRERERKR4DERERESkeAxEREREpHgMRERERKR4DERERESkeAxEREREpHgMRERERKR4DERERESkeAxEREREpHi1jD0BIjI+93cTjT0FMrLzs4KNPQUio+IKERERESkeAxEREREpHgMRERERKR4DERERESkeAxEREREpHgMRERERKR4DERERESkeAxEREREpHgMRERERKR4DERERESkeAxEREREpHgMRERERKZ5RA5G7uztUKpXBFh4eDgC4desWwsPDYWdnB2tra4SEhKCgoEA2Rk5ODoKDg2FlZQUHBwdMmTIFd+7ckdWkpqaiVatWUKvV8PT0RGxs7NM6RCIiInoGGDUQHTp0CHl5edKWnJwMAHjrrbcAABEREdi8eTPWr1+PXbt2ITc3F3369JGeX1ZWhuDgYJSWlmLfvn1YuXIlYmNjERUVJdWcO3cOwcHB6NKlCzIyMjBx4kSMHDkSSUlJT/dgiYiIqMZSCSGEsSehN3HiRGzZsgWnTp2CTqeDvb094uPj0bdvXwDAiRMn4O3tjbS0NLRt2xY//vgjevXqhdzcXDg6OgIAlixZgmnTpuHy5cswNzfHtGnTkJiYiMzMTGk/AwYMQGFhIbZu3Vqleel0Omi1WhQVFUGj0VT/gRMZmfu7icaeAhnZ+VnBxp4CUbV7mL/fNeYaotLSUqxevRojRoyASqVCeno6bt++jYCAAKnGy8sLDRo0QFpaGgAgLS0NzZo1k8IQAAQFB
UGn0+HYsWNSzb1j6Gv0Y1SkpKQEOp1OthEREdHzq8YEok2bNqGwsBDDhw8HAOTn58Pc3Bw2NjayOkdHR+Tn50s194Yhfb++7341Op0ON2/erHAu0dHR0Gq10ubq6vq4h0dEREQ1WI0JRMuXL0fPnj3h4uJi7KkgMjISRUVF0nbhwgVjT4mIiIieoFrGngAA/Pbbb9i+fTs2bNggtTk5OaG0tBSFhYWyVaKCggI4OTlJNQcPHpSNpf8U2r01f/9kWkFBATQaDSwtLSucj1qthlqtfuzjIiIiomdDjVghiomJgYODA4KD/7qor3Xr1jAzM0NKSorUlp2djZycHPj5+QEA/Pz8cPToUVy6dEmqSU5OhkajgY+Pj1Rz7xj6Gv0YREREREYPROXl5YiJiUFoaChq1fprwUqr1SIsLAyTJk3Czp07kZ6ejrfffht+fn5o27YtACAwMBA+Pj4YOnQojhw5gqSkJLz//vsIDw+XVnjGjBmDs2fPYurUqThx4gQWLVqEdevWISIiwijHS0RERDWP0U+Zbd++HTk5ORgxYoRB39y5c2FiYoKQkBCUlJQgKCgIixYtkvpNTU2xZcsWjB07Fn5+fqhduzZCQ0Mxc+ZMqcbDwwOJiYmIiIjA/PnzUb9+fSxbtgxBQUFP5fiIiIio5qtR9yGqqXgfInre8T5ExPsQ0fPombwPEREREZGxMBARERGR4jEQERERkeIxEBEREZHiMRARERGR4jEQERERkeIxEBEREZHiMRARERGR4jEQERERkeIxEBEREZHiMRARERGR4jEQERERkeIxEBEREZHiMRARERGR4jEQERERkeIxEBEREZHiMRARERGR4jEQERERkeIxEBEREZHiMRARERGR4jEQERERkeIxEBEREZHiMRARERGR4jEQERERkeIxEBEREZHiMRARERGR4jEQERERkeIxEBEREZHiMRARERGR4jEQERERkeIxEBEREZHiMRARERGR4jEQERERkeIxEBEREZHiMRARERGR4jEQERERkeIxEBEREZHiGT0QXbx4EUOGDIGdnR0sLS3RrFkzHD58WOoXQiAqKgrOzs6wtLREQEAATp06JRvj6tWrGDx4MDQaDWxsbBAWFobi4mJZza+//oqOHTvCwsICrq6umD179lM5PiIiIqr5jBqIrl27hvbt28PMzAw//vgjjh8/jjlz5qBu3bpSzezZs/HVV19hyZIlOHDgAGrXro2goCDcunVLqhk8eDCOHTuG5ORkbNmyBbt378bo0aOlfp1Oh8DAQLi5uSE9PR2ff/45pk+fjqVLlz7V4yUiIqKaSSWEEMba+bvvvou9e/diz549FfYLIeDi4oJ///vfmDx5MgCgqKgIjo6OiI2NxYABA5CVlQUfHx8cOnQIbdq0AQBs3boVr776Kn7//Xe4uLhg8eLFeO+995Cfnw9zc3Np35s2bcKJEyceOE+dTgetVouioiJoNJpqOnqimsP93URjT4GM7PysYGNPgajaPczfb6OuEP3www9o06YN3nrrLTg4OOCll17CN998I/WfO3cO+fn5CAgIkNq0Wi18fX2RlpYGAEhLS4ONjY0UhgAgICAAJiYmOHDggFTTqVMnKQwBQFBQELKzs3Ht2jWDeZWUlECn08k2IiIien4ZNRCdPXsWixcvxosvvoikpCSMHTsW//rXv7By5UoAQH5+PgDA0dFR9jxHR0epLz8/Hw4ODrL+WrVqwdbWVlZT0Rj37uNe0dHR0Gq10ubq6loNR0tEREQ1lVEDUXl5OVq1aoVPP/0UL730EkaPHo1Ro0ZhyZIlxpwWIiMjUVRUJG0XLlww6nyIiIjoyTJqIHJ2doaPj4+szdvbGzk5OQAAJycnAEBBQYGspqCgQOpzcnLCpUuXZP137tzB1atXZTUVjXHvPu6lVquh0WhkGxERET2/jBqI2rdvj+zsbFnbyZMn4ebmBgDw8PCAk5MTUlJSpH6dTocDBw7Az88PAODn54fCwkKkp6dLNTt27EB5eTl8fX2lmt27d+P27dtSTXJyMho3biz7RBsREREpk1EDUUREBPbv349PP/0Up0+fRnx8PJYuXYrw8HAAgEqlwsSJE/Hxxx/jhx9+wNGjRzFs2DC4uLigd+/eAO6uKPXo0QOjRo3CwYMHsXfvXowbNw4DBgyAi4sLAGDQoEEwNzdHWFgYjh07hrVr12L+/PmYNGmSsQ6diIiIapBaxtz5yy+/jI0bNyIyMhIzZ86Eh4cH5s2bh8GDB0s1U6dOxY0bNzB69GgUFhaiQ4cO2Lp1KywsLKSauLg4jBs3Dt26dYOJiQlCQkLw1VdfSf1arRbbtm1DeHg4WrdujXr16iEqKkp2ryIiIiJSLqPeh+hZwfsQ0fOO9yEi3oeInkfPzH2IiIiIiGoCBiIiIiJSPAYiIiIiUjwGIiIiIlI8BiIiIiJSPAYiIiIiUjwGIiIiIlI8BiIiIiJSPAYiIiIiUjwGIiIiIlI8BiIiIiJSPAYiIiIiUjwGIiIiIlI8BiIiIiJSPAYiIiIiUjwGIiIiIlI8BiIiIiJSPAYiIiIiUjwGIiIiIlI8BiIiIiJSPAYiIiIiUjwGIiIiIlI8BiIiIiJSPAYiIiIiUjwGIiIiIlI8BiIiIiJSPAYiIiIiUjwGIiIiIlI8BiIiIiJSPAYiIiIiUjwGIiIiIlI8BiIiIiJSPAYiIiIiUjwGIiIiIlI8BiIiIiJSPAYiIiIiUjyjBqLp06dDpVLJNi8vL6n/1q1bCA8Ph52dHaytrRESEoKCggLZGDk5OQgODoaVlRUcHBwwZcoU3LlzR1aTmpqKVq1aQa1Ww9PTE7GxsU/j8IiIiOgZYfQVoiZNmiAvL0/afvrpJ6kvIiICmzdvxvr167Fr1y7k5uaiT58+Un9ZWRmCg4NRWlqKffv2YeXKlYiNjUVUVJRUc+7cOQQHB6NLly7IyMjAxIkTMXLkSCQlJT3V4yQiIqKaq5bRJ1CrFpycnAzai4qKsHz5csTHx6Nr164AgJiYGHh7e2P//v1o27Yttm3bhuPHj2P79u1wdHREy5Yt8dFHH2HatGmYPn06zM3NsWTJEnh4eGDOnDkAAG9vb/z000+YO3cugoKCnuqxEhERUc1k9BWiU6dOwcXFBS+88AIGDx6MnJwcAEB6ejpu376NgIAAqdbLywsNGjRAWloaACAtLQ3NmjWDo6OjVBMUFASdTodjx45JNfeOoa/Rj0FERERk1BUiX19fxMbGonHjxsjLy8OMGTPQsWNHZGZmIj8/H+bm5rCxsZE9x9HREfn5+QCA/Px8WRjS9+v77lej0+lw8+ZNWFpaGsyrpKQEJSUl0mOdTvfYx0pEREQ1l1EDUc+ePaX/bt68OXx9feHm5oZ169ZVGFSelujoaMyYMcNo+yciIqKny+inzO5lY2ODRo0a4fTp03ByckJpaSkKCwtlNQUFBdI1R05OTgafOtM/flCNRqOpNHRFRkaiqKhI2i5cuFAdh0dEREQ1VI0KRMXFxThz5gycnZ3RunVrmJmZISUlRerPzs5GTk4O/Pz8AAB+
+       [... base64 "image/png" payload omitted: an unreadable embedded PNG of the bar chart produced by the
+       plotting cell below (counts on the y-axis against the grouped column on the x-axis), together with its
+       "text/plain" figure repr ...]
" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ + "import matplotlib.pyplot as plt # this allows us to create some graphs\n", + "\n", + "# Create a combined column for x-axis labels (helpful if you are going to combine two columns together)\n", + "df['Combined'] = df[cols_to_group[0]].astype(str)\n", + "\n", + "# Plotting the bar chart\n", + "plt.bar(df['Combined'], df['Count'])\n", + "\n", + "# Setting labels and title\n", + "plt.xlabel(f\"{cols_to_group[0]}\")\n", + "plt.ylabel('Count')\n", + "plt.title(f\"Count by {cols_to_group[0]}\")\n", + "plt.savefig(f\"{output_folder}/count_by_{cols_to_group[0]}\")\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Changing Outputs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Due to how the code is written, you can easily swap what column you are investigating. Under \"Config\", try swapping the column in cols_to_group with one of the other columns listed here. Remember to update the name mapping too.\n", + "\n", + "ACTIVAGE - Patient's Age\n", + "\n", + "AEATTENDDISP - Attendance Disposal\n", + "\n", + "AEINCLOCTYPE - Where incident occurred\n", + "\n", + "AEREFSOURCE - Source of referral\n", + "\n", + "ETHNOS - Ethnicity of patient\n", "\n", + "RESGOR_ONS - Region of Residence\n", + "\n", + "SEX - Patient's Sex\n", + "\n", + "Check out the HES Technical Output Specification to understand what each column mean and the possible options.\n", + "\n", + "https://digital.nhs.uk/data-and-information/data-tools-and-services/data-services/hospital-episode-statistics/hospital-episode-statistics-data-dictionary\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Advanced" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You could try grouping by two or more columns, to understand the data a little deeper.\n", "\n", - "print(list(df_data.columns))\n", - "df_data = df_data[[\"LSOA11\", \"SEX\"]]\n", - "df_data = df_data.groupby([\"LSOA11\"]).nunique()\n", + "You can add as many columns as you wish to the cols_to_group list, but in order to visualise them correctly you will need to combine them together:\n", "\n", - "distinct_epikey_count = df_data[col_to_aggregate].nunique()\n", - "print(f\"Distinct EPIKEY count: {distinct_epikey_count}\")\n", + "**df['Combined'] = df[cols_to_group[0]].astype(str) + \"-\" + df[cols_to_group[1]].astype(str)**\n", "\n", - "# find nice cols\n", - "# make plt bar chart showing...something cool\n", - "# double check all the comments\n", - "# update readme\n", - "# create new repo for this branch\n", - "# show sam\n", - "# ... profit\n" + "Remember to update the title and labels too." 
] } ], From 6ff4c383a2c1492778f3228a955204128cc0b527 Mon Sep 17 00:00:00 2001 From: jenniferstruthers1-nhs Date: Fri, 12 Jul 2024 10:54:06 +0000 Subject: [PATCH 7/8] updated comments --- rap_example_pipeline_python.ipynb | 58 +++++++++++++++++-------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/rap_example_pipeline_python.ipynb b/rap_example_pipeline_python.ipynb index 61d320e..3715e09 100644 --- a/rap_example_pipeline_python.ipynb +++ b/rap_example_pipeline_python.ipynb @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 258, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -73,11 +73,19 @@ }, { "cell_type": "code", - "execution_count": 259, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "import pandas as pd # this allows us to work with dataframes" + "# These libraries will help us download the file\n", + "import zipfile\n", + "import io\n", + "from pathlib import Path\n", + "import requests\n", + "\n", + "\n", + "import pandas as pd # this allows us to work with dataframes\n", + "import matplotlib.pyplot as plt # this allows us to create some graphs\n" ] }, { @@ -91,14 +99,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "It's important that we don't hardcode things which can change into the code - instead we keep things like that in config files.\n", - "\n", - "An example is where the data is to be picked up from and where any outputs will be saved to: these will change from when you are working in \"dev\" to when the code is finalised and put into \"production\"." + "Quite often, when we create an analytical process in code, there will be parts of the code that we need to update - dates, URLs, file paths, and so on.\n", + " \n", + "We usually put these in a separate file called a config file. That way, all our settings are in one place. Here we'll just put our config settings in this cell below." ] }, { "cell_type": "code", - "execution_count": 260, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -129,23 +137,17 @@ "This code:\n", "- gets the location of the data from the config file\n", "- downloads the CSV\n", - "- loads that CSV into a pandas dataframe in memory\n", + "- saves it to our data_in folder\n", "\n", "This is just an example - in another setting we could make it load the data from a SQL server, or from a database, S3 bucket, etc." ] }, { "cell_type": "code", - "execution_count": 261, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "import zipfile\n", - "import io\n", - "from pathlib import Path\n", - "import requests\n", - "\n", - "\n", "filename = Path(zip_file_url).name\n", "output_path = f\"data_in/{filename}\"\n", "\n", @@ -154,13 +156,21 @@ "downloaded_zip.extractall(output_path)\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we need to read in our data and store it so we can continue to use and manipulate it. 
We'll use the pandas method read_csv to turn the data within the csv file into a pandas dataframe (commonly referred to as df).\n",
+    "\n",
+    "A pandas DataFrame is a two-dimensional, labeled data structure in Python, similar to a table in a database or an Excel spreadsheet, that allows for the storage and manipulation of data across rows and columns."
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 262,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# read the CSV file into a pandas dataframe\n",
     "df = pd.read_csv(path_to_downloaded_data)"
    ]
   },
@@ -173,7 +183,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 263,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -385,7 +395,7 @@
        "[5 rows x 165 columns]"
       ]
      },
-     "execution_count": 263,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
@@ -412,7 +422,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 264,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -440,7 +450,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 265,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -480,7 +490,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 266,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -495,8 +505,6 @@
     }
    ],
    "source": [
-    "import matplotlib.pyplot as plt  # this allows us to create some graphs\n",
-    "\n",
     "# Create a combined column for x-axis labels (helpful if you are going to combine two columns together)\n",
     "df['Combined'] = df[cols_to_group[0]].astype(str)\n",
     "\n",
@@ -524,8 +532,6 @@
    "source": [
     "Due to how the code is written, you can easily swap what column you are investigating. Under \"Config\", try swapping the column in cols_to_group with one of the other columns listed here. Remember to update the name mapping too.\n",
     "\n",
-    "ACTIVAGE - Patient's Age\n",
-    "\n",
     "AEATTENDDISP - Attendance Disposal\n",
     "\n",
     "AEINCLOCTYPE - Where incident occurred\n",

From a3b7d4d2a625993533d8d34115c735ff57ab490d Mon Sep 17 00:00:00 2001
From: jenniferstruthers1-nhs
Date: Fri, 12 Jul 2024 11:14:05 +0000
Subject: [PATCH 8/8] removed savefig

---
 rap_example_pipeline_python.ipynb | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/rap_example_pipeline_python.ipynb b/rap_example_pipeline_python.ipynb
index 3715e09..fd517d8 100644
--- a/rap_example_pipeline_python.ipynb
+++ b/rap_example_pipeline_python.ipynb
@@ -52,7 +52,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -73,7 +73,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -114,10 +114,7 @@
    "path_to_downloaded_data = \"data_in/artificial_hes_ae_202302_v1_sample.zip/artificial_hes_ae_202302_v1_sample/artificial_hes_ae_2122.csv\"\n",
    "\n",
    "# The column(s) we are going to investigate\n",
-    "cols_to_group = [\"AEARRIVALMODE\"]\n",
-    "\n",
-    "# where to save our graphs\n",
-    "output_folder = \"data_out\"\n"
+    "cols_to_group = [\"AEARRIVALMODE\"]\n"
    ]
   },
@@ -515,7 +512,6 @@
    "plt.xlabel(f\"{cols_to_group[0]}\")\n",
    "plt.ylabel('Count')\n",
    "plt.title(f\"Count by {cols_to_group[0]}\")\n",
-    "plt.savefig(f\"{output_folder}/count_by_{cols_to_group[0]}\")\n",
    "plt.show()\n"
    ]
   },
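PATCH 7/8 above reworks the markdown around keeping changeable values (dates, URLs, file paths) out of the code and in one "Config" cell. Those same settings could equally live in a small TOML file, which is how the wider repository's config.toml works. A minimal sketch follows, assuming a hypothetical notebook_config.toml holding the keys the Config cell already defines (zip_file_url, path_to_downloaded_data, cols_to_group); the file name and the loading code are illustrative, not part of the patch series.

```python
# Minimal sketch, not part of the patches. Assumes a hypothetical
# notebook_config.toml next to the notebook, e.g.:
#
#   zip_file_url = "..."   # the download URL the notebook's Config cell uses
#   path_to_downloaded_data = "data_in/artificial_hes_ae_202302_v1_sample.zip/artificial_hes_ae_202302_v1_sample/artificial_hes_ae_2122.csv"
#   cols_to_group = ["AEARRIVALMODE"]

import tomllib  # standard library from Python 3.11; older versions can use the tomli package

with open("notebook_config.toml", "rb") as f:
    config = tomllib.load(f)

# The rest of the notebook then reads its settings from one place
zip_file_url = config["zip_file_url"]
path_to_downloaded_data = config["path_to_downloaded_data"]
cols_to_group = config["cols_to_group"]
```

Keeping these values out of the code means that switching data year or file location only touches the config, which is the point the updated markdown cell makes.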