From 98709eb5b86f6c89ac9cf448900fb6420d0353f2 Mon Sep 17 00:00:00 2001 From: jimenaRL Date: Tue, 17 Oct 2023 17:21:18 +0200 Subject: [PATCH] Set up a default hits value, add dev Dockerfile and conf, modify HAL URL construction for both dev (medialab) and full (sciences po) dump getting --- .gitignore | 1 + Dockerfile_dev | 9 ++++++++- config.yaml | 2 ++ config_dev.yaml | 46 ++++++++++++++++++++++++++++++++++++++++++++ get_dump.py | 4 ++-- python/halexp/app.py | 2 +- 6 files changed, 60 insertions(+), 4 deletions(-) create mode 100644 config_dev.yaml diff --git a/.gitignore b/.gitignore index e191938..bd7a523 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *hnswlib.index *hal-productions.json +*hal-productions_medialab.json # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/Dockerfile_dev b/Dockerfile_dev index f4adeb3..de4fc33 100644 --- a/Dockerfile_dev +++ b/Dockerfile_dev @@ -84,7 +84,14 @@ RUN apt-get update && \ RUN git clone https://github.com/medialab/halexp.git WORKDIR /halexp +# download sBert models +RUN python -c "from sentence_transformers import SentenceTransformer; sBert = SentenceTransformer('distiluse-base-multilingual-cased-v1')" + ENV APPCONFIG=/halexp/config.yaml ENV FLASK_APP=/halexp/python/halexp/app.py -#CMD ["bash", "start.sh"] +# get HAL dump +RUN python get_dump.py --config=config_dev.yaml + +# run server +CMD ["flask", "run", "--host=0.0.0.0", "--port=80", "--debugger"] diff --git a/config.yaml b/config.yaml index 679ea8b..5a21ae6 100644 --- a/config.yaml +++ b/config.yaml @@ -1,4 +1,5 @@ app: + default_nb_hits: 5 style: imageWidth: 450 logoUrl: https://medialab.sciencespo.fr/static/logo_medialab_d4a4a5af-92bb-4651-97e7-22272a5a5d3f.png @@ -6,6 +7,7 @@ corpus: dump_file: hal-productions.json baseUrl: https://api.archives-ouvertes.fr portail: sciencespo + query: '*:*' pagination_count: 10000 fields: - sciencespoId_s diff --git a/config_dev.yaml b/config_dev.yaml new file mode 100644 index 0000000..bc9c861 --- 
/dev/null +++ b/config_dev.yaml @@ -0,0 +1,46 @@ +app: + default_nb_hits: 5 + style: + imageWidth: 450 + logoUrl: https://medialab.sciencespo.fr/static/logo_medialab_d4a4a5af-92bb-4651-97e7-22272a5a5d3f.png +corpus: + dump_file: hal-productions.json + baseUrl: https://api.archives-ouvertes.fr + portail: index + query: 'labStructId_i:394361' + pagination_count: 10000 + fields: + - sciencespoId_s + - halId_s + - uri_s + - docType_s + - language_s + - title_s + - subtitle_s + - abstract_s + - description_s + - en_title_s + - en_subTitle_s + - en_abstract_s + - en_description_s + - fr_title_s + - fr_subTitle_s + - fr_abstract_s + - fr_description_s + - modifiedDate_s + - submittedDate_s + - releasedDate_s + - producedDate_s + - publicationDate_s + - ePublicationDate_s + - conferenceStartDate_s + - conferenceEndDate_s + - writingDate_s + - defenseDate_s + - authFirstName_s + - authLastName_s + - citationFull_s +index: + hnswlib_space: cosine + sentence_transformer_model: distiluse-base-multilingual-cased-v1 + diff --git a/get_dump.py b/get_dump.py index 1b65644..c855f65 100644 --- a/get_dump.py +++ b/get_dump.py @@ -18,11 +18,11 @@ BASE_URL = params['baseUrl'] PAGINATION_COUNT = params['pagination_count'] -QUERY = "*:*" +QUERY = params['query'] FL_PARAM = '&fl='+','.join(params['fields']) PORTAIL = params['portail'] -base_url = f"{BASE_URL}/search/{PORTAIL}/?q{QUERY}" +base_url = f"{BASE_URL}/search/{PORTAIL}/?q={QUERY}" base_url += f"&wt=json&fl={FL_PARAM}" base_url += f"&rows={PAGINATION_COUNT}&sort=docid+asc" diff --git a/python/halexp/app.py b/python/halexp/app.py index 8912b79..d9806b0 100644 --- a/python/halexp/app.py +++ b/python/halexp/app.py @@ -75,7 +75,7 @@ def query(): return {'error': 'Missing `query` argument in query string'} nb_hits = request.args.get('hits') if nb_hits is None: - return {'error': 'Missing `hits` argument in query string'} + nb_hits = params['app']['default_nb_hits'] res = index.retrieve(query=query, top_k=castInt(nb_hits))