update readme; remove unneeded packages; reduce debug print statements

Brown-University-Library · Jul 16, 2024 · f328960 · f328960
1 parent c8d5d2e
commit f328960
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 15 deletions.
diff --git a/README.md b/README.md
@@ -2,13 +2,64 @@
 
 Combines 7 FileMakerPro export merge-CSV format tables into one, for easier subsequent processing.
 
+## Installation
 
-# Usage
+1. Clone the repository
+2. Create a virtual environment and activate it (optional but recommended)
+```bash
+python3 -m venv ./env
+source ./env/bin/activate
+```
+3. Install the required packages
+```bash
+pip install -r requirements.txt
+```
 
-Typical usage:
+## Usage
 
-`python ./fmp_denormalize.py --input_dir "/path/to/input_dir/" --ouput_path "/path/to/output_file"`
+This script denormalizes data from the FileMaker Pro database export into a single CSV file. It can process input either from a directory containing the required CSV files or from a zip file containing these files. The output is a denormalized CSV file that combines information from all input files.
 
-See `python ./fmp_denormalize.py --help` for more info.
+The directory or zip file must contain the following 7 CSV files:
+- alternative_name.csv
+- folders.csv
+- locations.csv
+- members.csv
+- related_collections.csv
+- sources.csv
+- subjects.csv
+
+### Note
+Warnings such as
+```
+Warning: column Notes is present in both main_data and sources_data
+```
+are expected and can be ignored. They are generated when the script encounters columns with the same name in multiple input files. The script will rename these columns to avoid conflicts.
+
+
+### Command Line Arguments
+
+- `--input_dir`: Specifies the path to a directory containing all 7 required FMP CSV files. You must specify either `--input_dir` or `--input_zip`, but not both.
+- `--input_zip`: Specifies the path to the zip file containing all 7 required FMP CSV files. You must specify either `--input_dir` or `--input_zip`, but not both.
+- `--output_path`: Specifies the path for the output CSV file or the directory where the denormalized data will be saved. If a directory is specified, the output file will be named `fmp_denormalized.csv` within that directory. If a file is specified, the output will be saved to that file. In either case, any existing file with the same name will be overwritten.
+- `--limit_orgs`: (Optional) Limits the organizations to include in the output. Specify a path to a text or CSV file containing a list of organization IDs to include.
+
+### Examples
+
+1. **Using a Directory of CSV Files for input and a directory for output**
+
+   ```bash
+   python fmp_denormalize.py --input_dir /path/to/csv/files --output_path /path/to/output/
+   ```
+2. **Using a Zip File of CSV Files for input and a file for output**
+
+   ```bash
+   python fmp_denormalize.py --input_zip /path/to/csv/files.zip --output_path /path/to/output/combined_files_2024-07-16.csv
+   ```
+
+3. **Limiting Organizations**
+
+   ```bash
+   python fmp_denormalize.py --input_dir /path/to/csv/files --output_path /path/to/output/fmp_denormalized.csv --limit_orgs /path/to/orgs.txt
+   ```
 
 ---
diff --git a/fmp_denormalize.py b/fmp_denormalize.py
@@ -1,9 +1,7 @@
-import os, sys
+import os
 import zipfile
 import pandas as pd
 import argparse
-import pathlib
-from tqdm import tqdm
 
 def read_csv(input_file):
     data = pd.read_csv(input_file, dtype=str).fillna('')
@@ -64,7 +62,7 @@ def handle_duplicates(df, join_on, unique_only=True):
 def save_data_to_csv(data, output_file):
     data.to_csv(output_file, index=False)
     #for testing, save a version of the data with only the first 500 rows
-    data.head(500).to_csv(output_file.replace('.csv', '_head500.csv'), index=False)
+    # data.head(500).to_csv(output_file.replace('.csv', '_head500.csv'), index=False)
 
 def prep_limit_orgs(orgs_to_include):
     # The format of the org IDs coming from FMP is different than in other places
@@ -127,7 +125,7 @@ def prep_limit_orgs(orgs_to_include):
             data = read_csvs_from_zip(args.input_zip, expected_filenames)
 
         print('Data read in successfully')
-        print(f'folder data: {data["folders"].head(20)}')
+        # print(f'folder data: {data["folders"].head(20)}')
 
         # #print any rows in folders with float values
         # for index, row in data['folders'].iterrows():
@@ -222,6 +220,9 @@ def prep_limit_orgs(orgs_to_include):
 
         save_data_to_csv(main_data, output_file)
 
+        # Print Done message in green
+        print('\033[92m' + 'Done!' + '\033[0m')
+
     except Exception as e:
         print(f'An error occurred: {str(e)}')
         exit(1)
diff --git a/requirements.in b/requirements.in
@@ -1,2 +1 @@
 pandas==2.0.3
-tqdm==4.66.2
diff --git a/requirements.txt b/requirements.txt
@@ -1,20 +1,18 @@
 #
-# This file is autogenerated by pip-compile with Python 3.8
+# This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
-#    pip-compile ./requirements.in
+#    pip-compile requirements.in
 #
 numpy==1.24.4
     # via pandas
 pandas==2.0.3
-    # via -r ./requirements.in
+    # via -r requirements.in
 python-dateutil==2.9.0.post0
     # via pandas
 pytz==2024.1
     # via pandas
 six==1.16.0
     # via python-dateutil
-tqdm==4.66.2
-    # via -r ./requirements.in
 tzdata==2024.1
     # via pandas