class AccessLogFilter(BaseFilterBackend):
    """
    Used for filtering data returned by the custom API for OHLOG_PROJECT_ID.

    Recognised query parameters:

    * ``start_date`` / ``end_date``: any value ``arrow.get`` can parse; they
      bound the log timestamp inclusively.
    * ``datafile_id``: integer id matched against ``serialized_data_file__id``.

    Values that cannot be parsed are ignored rather than rejected.
    """

    @staticmethod
    def _parse_datetime(raw_value):
        """
        Return raw_value parsed into a datetime, or None if missing or invalid.
        """
        if not raw_value:
            return None
        try:
            return arrow.get(raw_value).datetime
        except (TypeError, ValueError, arrow.parser.ParserError):
            # Some arrow releases raise a ParserError that does not derive
            # from ValueError, so it has to be named explicitly here.
            return None

    def filter_queryset(self, request, queryset, view):
        """
        Narrow queryset by the optional date range and datafile id.

        Returns the (possibly further filtered) queryset; never raises on bad
        query parameter values.
        """
        params = request.query_params
        start_date = self._parse_datetime(params.get("start_date", None))
        end_date = self._parse_datetime(params.get("end_date", None))

        # Special check if start_date and end_date are the same.
        # If this is the case, assume that a 24 hour period is meant and move
        # end_date to the last representable instant of that period, so that
        # the inclusive upper bound does not drop events from its final second.
        if end_date is not None and start_date == end_date:
            end_date = end_date + (
                datetime.timedelta(days=1) - datetime.timedelta(microseconds=1)
            )

        # AWS uses 'time' for the timestamp rather than 'date'
        if queryset.model.__name__ == "AWSDataFileAccessLog":
            timestamp_field = "time"
        else:
            timestamp_field = "date"

        if start_date is not None:
            queryset = queryset.filter(**{timestamp_field + "__gte": start_date})
        if end_date is not None:
            queryset = queryset.filter(**{timestamp_field + "__lte": end_date})

        datafile_id = params.get("datafile_id", None)
        if datafile_id:
            try:
                datafile_id = int(datafile_id)
            except ValueError:
                # A non-numeric id is ignored; the queryset is left untouched
                pass
            else:
                queryset = queryset.filter(serialized_data_file__id=datafile_id)

        return queryset
class Migration(migrations.Migration):
    """
    Switch the AWS access log byte counters to nullable BigIntegerField columns.
    """

    dependencies = [("data_import", "0017_auto_20190329_1638")]

    # Both columns receive the same field definition, so the operations are
    # generated from the list of column names.
    operations = [
        migrations.AlterField(
            model_name="awsdatafileaccesslog",
            name=column_name,
            field=models.BigIntegerField(null=True),
        )
        for column_name in ("bytes_sent", "object_size")
    ]
class LogAPIAccessAllowed(BasePermission):
    """
    Return True if the request is from OHLOG_PROJECT_ID.

    Access is denied (False) when OHLOG_PROJECT_ID is unset or not an integer,
    or when the request carries no authenticated project.
    """

    def has_permission(self, request, view):
        """
        Compare the id of ``request.auth`` with the configured project id.
        """
        configured_id = getattr(settings, "OHLOG_PROJECT_ID", None)
        if not configured_id:
            return False
        try:
            allowed_project_id = int(configured_id)
        except (TypeError, ValueError):
            # A misconfigured setting must deny access, not crash the request
            return False
        # request.auth may be None for unauthenticated requests; getattr keeps
        # that case a plain denial instead of an AttributeError.
        return getattr(request.auth, "id", None) == allowed_project_id
"object_size", + "total_time", + "turn_around_time", + "referrer", + "user_agent", + "cipher_suite", + "host_header", + "datafile", + ] + + class DataTypeSerializer(serializers.ModelSerializer): """ Serialize DataTypes diff --git a/data_import/urls.py b/data_import/urls.py index 4db779f3c..e407abce4 100644 --- a/data_import/urls.py +++ b/data_import/urls.py @@ -1,11 +1,13 @@ -from django.urls import re_path +from django.urls import path, re_path from .views import ( + AWSDataFileAccessLogView, DataFileDownloadView, DataTypesCreateView, DataTypesDetailView, DataTypesListView, DataTypesUpdateView, + NewDataFileAccessLogView, ) app_name = "data-management" @@ -16,6 +18,7 @@ DataFileDownloadView.as_view(), name="datafile-download", ), + # DataTypes paths re_path( r"^datatypes/create/", DataTypesCreateView.as_view(), name="datatypes-create" ), @@ -30,4 +33,15 @@ name="datatypes-detail", ), re_path(r"^datatypes/", DataTypesListView.as_view(), name="datatypes-list"), + # Custom API endpoints for OHLOG_PROJECT_ID + path( + "awsdatafileaccesslog/", + AWSDataFileAccessLogView.as_view(), + name="awsdatafileaccesslog", + ), + path( + "newdatafileaccesslog/", + NewDataFileAccessLogView.as_view(), + name="newdatafileaccesslog", + ), ] diff --git a/data_import/views.py b/data_import/views.py index 5f7e5064f..3f8aaa0cd 100644 --- a/data_import/views.py +++ b/data_import/views.py @@ -5,13 +5,29 @@ from django.urls import reverse from django.views.generic import CreateView, DetailView, TemplateView, UpdateView, View +from django_filters.rest_framework import DjangoFilterBackend from ipware.ip import get_ip +from rest_framework.generics import ListAPIView from common.mixins import NeverCacheMixin, PrivateMixin - +from data_import.serializers import ( + AWSDataFileAccessLogSerializer, + NewDataFileAccessLogSerializer, + serialize_datafile_to_dict, +) +from private_sharing.api_authentication import CustomOAuth2Authentication +from private_sharing.api_permissions import 
class NewDataFileAccessLogView(NeverCacheMixin, ListAPIView):
    """
    Custom API endpoint returning logs of file access requests for OHLOG_PROJECT_ID
    """

    serializer_class = NewDataFileAccessLogSerializer
    authentication_classes = (CustomOAuth2Authentication,)
    permission_classes = (HasValidProjectToken, LogAPIAccessAllowed)
    filter_backends = (AccessLogFilter, DjangoFilterBackend)
    filterset_fields = ("date",)

    def get_queryset(self):
        """
        Return the access log entries whose serialized data file has the
        requesting user's id as its user_id.
        """
        member_id = self.request.user.id
        return NewDataFileAccessLog.objects.filter(
            serialized_data_file__user_id=member_id
        )


class AWSDataFileAccessLogView(NeverCacheMixin, ListAPIView):
    """
    Custom API endpoint returning logs of AWS file access events for OHLOG_PROJECT_ID
    """

    serializer_class = AWSDataFileAccessLogSerializer
    authentication_classes = (CustomOAuth2Authentication,)
    permission_classes = (HasValidProjectToken, LogAPIAccessAllowed)
    filter_backends = (AccessLogFilter, DjangoFilterBackend)
    filterset_fields = ("time",)

    def get_queryset(self):
        """
        Return the AWS access log entries whose serialized data file has the
        requesting user's id as its user_id.
        """
        member_id = self.request.user.id
        return AWSDataFileAccessLog.objects.filter(
            serialized_data_file__user_id=member_id
        )
diff --git a/open_humans/management/commands/vacuum_log_bucket.py b/open_humans/management/commands/vacuum_log_bucket.py index a3c5857bc..900f13420 100644 --- a/open_humans/management/commands/vacuum_log_bucket.py +++ b/open_humans/management/commands/vacuum_log_bucket.py @@ -70,7 +70,7 @@ def handle(self, *args, **options): ] for index, field_name in enumerate(fields): field = aws_log_entry._meta.get_field(field_name) - if field.get_internal_type() == "IntegerField": + if "IntegerField" in field.get_internal_type(): log_item = log[index] if (log_item == "-") or (log_item == '"-"'): log_item = 0 @@ -118,6 +118,8 @@ def handle(self, *args, **options): # Filter out things we don't care to log if settings.AWS_STORAGE_BUCKET_NAME in url: continue + if "GET" not in str(aws_log_entry.operation): + continue if any( blacklist_item in url for blacklist_item in AWS_LOG_KEY_BLACKLIST ): diff --git a/open_humans/settings.py b/open_humans/settings.py index 0df09111d..2956e3321 100644 --- a/open_humans/settings.py +++ b/open_humans/settings.py @@ -474,6 +474,8 @@ def to_bool(env, default="false"): RECAPTCHA_PUBLIC_KEY = os.getenv("RECAPTCHA_PUBLIC_KEY", "") RECAPTCHA_PRIVATE_KEY = os.getenv("RECAPTCHA_PRIVATE_KEY", "") +OHLOG_PROJECT_ID = os.getenv("OHLOG_PROJECT_ID", None) + ZAPIER_WEBHOOK_URL = os.getenv("ZAPIER_WEBHOOK_URL") MAX_UNAPPROVED_MEMBERS = int(os.getenv("MAX_UNAPPROVED_MEMBERS", "20")) diff --git a/private_sharing/migrations/0021_auto_20190402_1745.py b/private_sharing/migrations/0021_auto_20190410_2240.py similarity index 89% rename from private_sharing/migrations/0021_auto_20190402_1745.py rename to private_sharing/migrations/0021_auto_20190410_2240.py index 556dc2887..3b6c3e963 100644 --- a/private_sharing/migrations/0021_auto_20190402_1745.py +++ b/private_sharing/migrations/0021_auto_20190410_2240.py @@ -1,4 +1,4 @@ -# Generated by Django 2.1.7 on 2019-04-02 17:45 +# Generated by Django 2.1.7 on 2019-04-10 22:40 from django.db import migrations, models @@ -6,7 
+6,7 @@ class Migration(migrations.Migration): dependencies = [ - ("data_import", "0018_datatype"), + ("data_import", "0019_datatype"), ("private_sharing", "0020_auto_20190222_0036"), ] diff --git a/requirements.in b/requirements.in index 9d53e1454..8a69f6a8e 100644 --- a/requirements.in +++ b/requirements.in @@ -37,6 +37,7 @@ Markdown mock Pillow # for sorl-thumbnail pyparsing +python-dateutil raven redis requests