Merge pull request #328 from Emory-HITI/dev
Experimental new modules
pradeeban authored Jul 28, 2022
2 parents 43a4929 + 7142e62 commit e1ec735
Showing 28 changed files with 826 additions and 78 deletions.
16 changes: 8 additions & 8 deletions .github/workflows/test.yml
@@ -7,14 +7,14 @@ jobs:
run_unit_tests:
name: Run Unit Tests
runs-on: ubuntu-latest
-    container: "centos:latest"
+    container: "ubuntu:latest"
steps:
- uses: actions/checkout@master
- name: Install Python 3
run: |
-        yum update -y -q
-        yum install -q -y python3
-        yum install -q -y python3-pip unzip
+        apt-get update -y -q
+        apt-get install -q -y python3
+        apt-get install -q -y python3-pip unzip
- name: Install Dependencies
run: |
pip3 install -U pip
@@ -28,14 +28,14 @@ jobs:
run_integration_tests:
name: Run Integration Tests
runs-on: ubuntu-latest
-    container: "centos:latest"
+    container: "ubuntu:latest"
steps:
- uses: actions/checkout@master
- name: Install Python 3
run: |
-        yum update -y -q
-        yum install -q -y python3
-        yum install -q -y python3-pip unzip
+        apt-get update -y -q
+        apt-get install -q -y python3
+        apt-get install -q -y python3-pip unzip
- name: Install Dependencies
run: |
pip3 install -U pip
1 change: 1 addition & 0 deletions .gitignore
@@ -9,4 +9,5 @@ __pycache__
htmlcov
.coverage
coverage.xml
**/*.log
/tests/data
2 changes: 2 additions & 0 deletions CONTRIBUTING.md
@@ -24,5 +24,7 @@ This is a discussion page to help the developers get started soon.

* Please develop against the [dev](https://github.com/Emory-HITI/Niffler/tree/dev) branch. New contributors are encouraged to submit pull requests, rather than directly committing even when you have committer access. Another developer can then review and merge your pull request.

* The pull request should be minimal. Avoid including irrelevant changes in your pull request -- for example, unused library imports, line-break changes, or added/removed blank lines that make an unchanged code segment appear changed. Irrelevant changes dilute your actual contribution and make the pull request harder to review and merge. Please check with a "git diff" before committing your changes, to avoid such additions.

* Some important requests for enhancements are tracked in the [bug tracker](https://github.com/Emory-HITI/Niffler/issues).
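The "git diff" check recommended above can be illustrated end-to-end in a throwaway repository (the file name here is invented for the example):

```shell
# Illustration only: create a scratch repository, edit a file, and review
# the pending diff before staging -- the check recommended above.
tmp=$(mktemp -d) && cd "$tmp"
git init -q .
git config user.email dev@example.com
git config user.name dev
echo "original line" > module.py
git add module.py && git commit -qm "add module"
echo "new line" >> module.py
git diff --stat   # one summary line per changed file
git diff          # full unified diff: scan for stray, irrelevant changes
```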

5 changes: 5 additions & 0 deletions install.sh
@@ -2,12 +2,16 @@
echo "Configuring Niffler"
sudo chmod -R 777 .

wget -qO- https://get.nextflow.io | bash
sudo mv nextflow /usr/local/bin

PIP=`head -n 1 init/pip.out`
if [ "$PIP" = false ] ; then
sudo yum install -y python3
echo "Installing pip"
sudo yum install python3-pip
pip install -r requirements.txt
pip install -i https://test.pypi.org/simple/ HITI-anon-internal
wget https://repo.anaconda.com/archive/Anaconda3-2020.11-Linux-x86_64.sh
sh Anaconda3-2020.11-Linux-x86_64.sh -u
source ~/.bashrc
@@ -64,6 +68,7 @@ SERVICE=`head -n 1 init/service.out`
if [ "$SERVICE" = false ] ; then
echo "Installing Niffler Frontend"
pip install -r modules/frontend/requirements.txt
pip install -i https://test.pypi.org/simple/ HITI-anon-internal
chmod +x modules/frontend/service/frontend_service.sh
sudo cp modules/frontend/service/niffler.service /etc/systemd/system/
sudo systemctl daemon-reload
61 changes: 52 additions & 9 deletions modules/cold-extraction/ColdDataRetriever.py
@@ -14,7 +14,11 @@
import argparse
import random
import string
import itertools
import calendar

import pandas as pd
import numpy as np
from collections import defaultdict


@@ -24,7 +28,7 @@ def initialize_config_and_execute(valuesDict):
"""
global storescp_processes, niffler_processes, nifflerscp_str, niffler_str
global storage_folder, file_path, csv_file, number_of_query_attributes, first_index, second_index, third_index, \
-        first_attr, second_attr, third_attr, date_format, email, send_email, system_json
+        first_attr, second_attr, third_attr, date_format, email, send_email, system_json, mod_csv_file
global DCM4CHE_BIN, SRC_AET, QUERY_AET, DEST_AET, NIGHTLY_ONLY, START_HOUR, END_HOUR, IS_EXTRACTION_NOT_RUNNING, \
NIFFLER_ID, MAX_PROCESSES, SEPARATOR, cfind_add, out_folder
global firsts, seconds, thirds, niffler_log, resume, length, t_start, cfind_only, cfind_detailed, temp_folder
@@ -43,6 +47,8 @@ def initialize_config_and_execute(valuesDict):
email = valuesDict['YourEmail']
send_email = bool(valuesDict['SendEmail'])
system_json = valuesDict['NifflerSystem']
mod_csv_file = csv_file[:-4]+'_mod.csv'
shutil.copyfile(csv_file, mod_csv_file)

# Reads the system_json file.
with open(system_json, 'r') as f:
@@ -81,9 +87,9 @@ def initialize_config_and_execute(valuesDict):
cfind_add = '-r StudyDescription -x description.csv.xsl'
out_folder = temp_folder
elif file_path == cfind_detailed:
cfind_add = '-r StudyDescription -r StudyDate -r StudyTime -r DeviceSerialNumber -r ProtocolName ' \
'-r PerformedProcedureStepDescription -r NumberOfStudyRelatedSeries -r ' \
'NumberOfStudyRelatedInstances -r AcquisitionDate ' \
cfind_add = '-r StudyDescription -r StudyDate -r StudyTime -r DeviceSerialNumber ' \
'-r ProtocolName -r PerformedProcedureStepDescription -r NumberOfStudyRelatedSeries ' \
'-r NumberOfStudyRelatedInstances -r AcquisitionDate ' \
'-x detailed.csv.xsl'
out_folder = temp_folder
else:
@@ -104,13 +110,13 @@

# All extracted files from the csv file are saved in a respective .pickle file.
try:
-        with open(csv_file + '.pickle', 'rb') as f:
+        with open(mod_csv_file + '.pickle', 'rb') as f:
extracted_ones = pickle.load(f)
# Since we have successfully located a pickle file, it indicates that this is a resume.
resume = True
except:
logging.info("No existing pickle file found. Therefore, initialized with empty value to track the progress to "
-                     "{0}.pickle.".format(csv_file))
+                     "{0}.pickle.".format(mod_csv_file))

# record the start time
t_start = time.time()
@@ -171,13 +177,51 @@ def initialize():
subprocess.call("{0}/storescp --accept-unknown --directory {1} --filepath {2} -b {3} > storescp.out &".format(
DCM4CHE_BIN, storage_folder, file_path, QUERY_AET), shell=True)

def get_all_dates_given_month(string_val):
    """Return every date (as datetime.date) in the given YYYYMM month."""
    dt_stamp = datetime.datetime.strptime(string_val, '%Y%m')
    year, month = dt_stamp.year, dt_stamp.month
    no_of_days = calendar.monthrange(year, month)[1]
    return [datetime.date(year, month, day) for day in range(1, no_of_days + 1)]


def handle_study_month(file_path):
    """Expand each StudyMonth (YYYYMM) row into one row per day of that month."""
    df = pd.read_csv(file_path)
    if 'StudyMonth' not in df.columns:
        return df
    # Replace each YYYYMM value with the list of dates in that month, then
    # give every date its own row. Using explode avoids the chained
    # assignment (df[col][i] = ...) that pandas warns about.
    df['StudyMonth'] = df['StudyMonth'].astype(str).map(get_all_dates_given_month)
    mod_df = df.explode('StudyMonth', ignore_index=True)
    mod_df['StudyMonth'] = pd.to_datetime(mod_df['StudyMonth'], format='%Y-%m-%d')
    return mod_df.rename(columns={'StudyMonth': 'StudyDate'})

def read_csv():
    """
    Read and parse the user provided csv file.
    """
    global length
-    with open(csv_file, newline='') as f:
+    df = handle_study_month(mod_csv_file)
+    df.to_csv(mod_csv_file, index=False)
+    with open(mod_csv_file, newline='') as f:
        reader = csv.reader(f)
        next(f)

@@ -224,7 +268,6 @@ def convert_to_date_format(string_val):
date_str = dt_stamp.strftime('%Y%m%d')
return date_str


def run_retrieval():
"""
Run the retrieval only once, when the extraction script starts, and keep it running in a separate thread.
@@ -478,7 +521,7 @@ def update_pickle():
"""
Write the pickle file periodically to track the progress and persist it to the filesystem.
"""
-    with open(csv_file + '.pickle', 'wb') as f:
+    with open(mod_csv_file + '.pickle', 'wb') as f:
pickle.dump(extracted_ones, f)
logging.debug('Progress is recorded to the pickle file')
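The resume mechanism persists only the set of already-extracted identifiers; a minimal standalone sketch of the same save/load pattern (the progress file name here is invented for the example):

```python
import os
import pickle
import tempfile

# Hypothetical progress file, mirroring "<modified csv>.pickle".
progress_file = os.path.join(tempfile.gettempdir(), 'cohort_mod.csv.pickle')

def save_progress(extracted):
    """Persist the identifiers extracted so far."""
    with open(progress_file, 'wb') as f:
        pickle.dump(extracted, f)

def load_progress():
    """Return (identifiers, resumed) -- an empty set on a fresh start."""
    try:
        with open(progress_file, 'rb') as f:
            return pickle.load(f), True
    except (OSError, pickle.PickleError):
        return set(), False

save_progress({'1234, 000056789'})
extracted, resumed = load_progress()
print(resumed, extracted)  # True {'1234, 000056789'}
```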

25 changes: 16 additions & 9 deletions modules/cold-extraction/README.md
@@ -71,12 +71,18 @@ AAAAA,BBBBBYYBBBBB
[3]
PatientID,AccessionNumber,StudyDate
AAAAA,BBBBBYYBBBBB,CCCCCC
-AAAAA,BBBBBYYBBBBB,CCCCCC
-AAAAA,BBBBBYYBBBBB,CCCCCC
-AAAAA,BBBBBYYBBBBB,CCCCCC
+AAAAA,BBBBBYYBBBBB,YYYYMMDD
+AAAAA,BBBBBYYBBBBB,YYYYMMDD
+AAAAA,BBBBBYYBBBBB,YYYYMMDD
```
[4]
PatientID,AccessionNumber,StudyMonth
AAAAA,BBBBBYYBBBBB,YYYYMM
AAAAA,BBBBBYYBBBBB,YYYYMM
AAAAA,BBBBBYYBBBBB,YYYYMM
```

## Configuring Extraction Profile with config.json.

@@ -100,6 +106,7 @@ Example: `python3 ./ColdDataRetriever.py --NumberOfQueryAttributes 1 --FirstAttr
**Please note:** It is important to use the correct DICOM keywords such as, "PatientID", "AccessionNumber", "StudyInstanceUID", and "StudyDate".
Please refer to the DICOM Standard for more information on the DICOM header attributes/keywords.
Please note, the correct keyword is "AccessionNumber" and not "Accession" or "Accessions". Similarly, it is "PatientID" - neither "EMPI" nor "Patient-ID" (although they all are indeed the same in practice).
Although "StudyMonth" is not a DICOM attribute, the current version of Niffler supports it; a "StudyMonth" column works similarly to the "StudyDate" attribute.

Please refer to the DICOM standards to ensure you spell the [DICOM keyword](http://dicom.nema.org/dicom/2013/output/chtml/part06/chapter_6.html) correctly, if in doubt.
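Under the hood, the StudyMonth support expands each YYYYMM value into one query per day of that month. A standalone sketch of that expansion (the helper name mirrors the new `get_all_dates_given_month` function):

```python
import calendar
import datetime

def month_to_dates(yyyymm: str):
    """Expand a YYYYMM string into every date of that month."""
    start = datetime.datetime.strptime(yyyymm, '%Y%m')
    # monthrange returns (weekday of day 1, number of days in the month)
    n_days = calendar.monthrange(start.year, start.month)[1]
    return [datetime.date(start.year, start.month, day)
            for day in range(1, n_days + 1)]

dates = month_to_dates('202002')
print(len(dates))            # 29 -- February 2020 falls in a leap year
print(dates[0], dates[-1])   # 2020-02-01 2020-02-29
```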

@@ -115,7 +122,7 @@ Please refer to the DICOM standards to ensure you spell the [DICOM keyword](http

* *ThirdIndex*: Set the CSV column index of third Attribute. By default, 2. This field is ignored when NumberOfQueryAttributes is 1 or 2.

-* *DateFormat*: DateFormat can range from %Y%m%d, %m/%d/%y, %m-%d-%y, %m%d%y, etc. This field is ignored for extractions that do not use a Date as one of their extraction attributes. We have tested with StudyDate. Leave this entry unmodified for such cases. The default is %Y%m%d and works for most cases.
+* *DateFormat*: DateFormat can range from %Y%m%d, %m/%d/%y, %m-%d-%y, %m%d%y, etc. This field is ignored for extractions that do not use a date as one of their extraction attributes; leave this entry unmodified in such cases. We have tested with StudyDate. The default is %Y%m%d and works for most cases. When using the StudyMonth attribute, the default is %Y-%m-%d.

* *SendEmail*: Do you want to send an email notification when the extraction completes? The default is true. You may disable this if you do not want to receive an email upon the completion.
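DICOM queries ultimately need dates in %Y%m%d form, so a CSV date in the configured DateFormat is parsed and re-serialized. A small sketch of that conversion (function name is illustrative, not Niffler's API):

```python
import datetime

def to_dicom_date(value: str, date_format: str = '%Y%m%d') -> str:
    """Parse a CSV date in the configured DateFormat; emit DICOM's YYYYMMDD."""
    return datetime.datetime.strptime(value, date_format).strftime('%Y%m%d')

print(to_dicom_date('20220728'))                # 20220728 (default format)
print(to_dicom_date('07/28/22', '%m/%d/%y'))    # 20220728
print(to_dicom_date('2022-07-01', '%Y-%m-%d'))  # 20220701 (StudyMonth default)
```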

@@ -154,14 +161,14 @@ Try again later. Once there is no other process, then you can run your own extra


## Check the Progress

After some time (may take a few hours to a few days, depending on the length of the CSV file), check whether the extraction is complete.
```
$ tail -f niffler.log
INFO:root:[EXTRACTION COMPLETE] 2020-09-21 17:42:38.465501: Niffler Extraction to /opt/data/new-study Completes. Terminating the completed storescp process.
```
-A pickle file tracks the progress. The pickle file is created by appending ".pickle" to the csv file name in the same directory as the csv file. A sample pickle line is as below:
+In addition to the original CSV file, a modified version of the CSV file is created (depending on the attributes), and a pickle file tracks the progress. The pickle file is created by appending ".pickle" to the modified CSV file name, in the same directory as the CSV file. A sample pickle line is as below:

```
<8c>^X1234, 000056789<94>
Expand Down Expand Up @@ -209,7 +216,7 @@ If you find an error such as: "IndexError: list index out of range", that indica

Fix them and restart your Python process, by first finding and killing your python process and then starting Niffler as before.
```
-$ ps -xa | grep python
+$ sudo ps -xa | grep python
1866 ? Ss 0:00 /usr/bin/python3 /usr/bin/networkd-dispatcher --run-startup-triggers
@@ -219,7 +226,7 @@
3384 pts/0 S+ 0:00 grep --color=auto python
-$ kill 2926
+$ sudo kill 2926
```
You might need to run the above command with sudo to find others' Niffler processes.

8 changes: 5 additions & 3 deletions modules/dicom-anonymization/DicomAnonymizer.py
@@ -13,7 +13,7 @@
import glob
import pathlib
import pickle

import pydicom.valuerep as pydicom_types

def get_dcm_folders(dcm_root_dir):
# get all folders
@@ -91,7 +91,7 @@ def dcm_anonymize(dcm_folders, output_path, stop=None):
'PhysiciansOfRecord', 'PerformingPhysicianName', 'OperatorsName', 'PatientName', 'PatientID',
'IssuerOfPatientID', 'PatientBirthDate', 'PatientSex', 'OtherPatientIDs', 'PatientAge', 'PatientSize',
'PatientWeight', 'PatientAddress', 'EthnicGroup', 'PregnancyStatus', 'RequestingPhysician',
'PerformedProcedureStepStartDate', 'PerformedProcedureStepStartTime', 'PerformedProcedureStepID']
'PerformedProcedureStepStartDate', 'PerformedProcedureStepStartTime', 'PerformedProcedureStepID', 'PatientTelephoneNumbers']

# for upto 200 dcm folders
n = 0
@@ -125,7 +125,9 @@ def dcm_anonymize(dcm_folders, output_path, stop=None):
dcm_file.data_element(tag).value = 'N/A'
elif type(dcm_file.data_element(tag).value) == int:
dcm_file.data_element(tag).value = 0
-                    else:
+                    elif type(dcm_file.data_element(tag).value) == pydicom_types.PersonName:
+                        dcm_file.data_element(tag).value = 'N/A'
+                    else:
dcm_file.data_element(tag).value = 0.0
dcm_file.save_as(os.path.join(study_folder, new_filename + '.dcm'))
n += 1
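The value-blanking above dispatches on the element's Python type; the idea in isolation, with a plain dict standing in for a pydicom Dataset (names here are illustrative, not Niffler's API):

```python
# Sketch only: a dict stands in for the pydicom Dataset; the real module
# additionally maps pydicom's PersonName values to 'N/A'.
def blank_phi(record: dict, phi_tags) -> dict:
    for tag in phi_tags:
        if tag not in record:
            continue
        value = record[tag]
        if isinstance(value, str):
            record[tag] = 'N/A'   # free-text identifiers
        elif isinstance(value, int):
            record[tag] = 0       # integer-valued elements
        else:
            record[tag] = 0.0     # everything else, e.g. decimal values
    return record

ds = {'PatientName': 'DOE^JANE', 'PatientAge': 42, 'PatientWeight': 61.5}
print(blank_phi(ds, ['PatientName', 'PatientAge', 'PatientWeight']))
# {'PatientName': 'N/A', 'PatientAge': 0, 'PatientWeight': 0.0}
```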
2 changes: 1 addition & 1 deletion modules/frontend/requirements.txt
@@ -9,7 +9,7 @@ greenlet==1.1.0
itsdangerous==2.0.1
Jinja2==3.0.1
MarkupSafe==2.0.1
-numpy==1.21.0
+numpy==1.22.0
pandas==1.2.4
Pillow==9.0.1
pydicom==2.1.2
