forked from tsutterley/reference-toolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
search_references.py
205 lines (191 loc) · 9.16 KB
/
search_references.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#!/usr/bin/env python
u"""
search_references.py (12/2020)
Reads bibtex files for each article in a given set of years to search for
keywords, authors, journal, etc using regular expressions
CALLING SEQUENCE:
python search_references.py -A Rignot -F -Y 2008 -J "Nature Geoscience"
will return the bibtex entry for this publication
COMMAND LINE OPTIONS:
-A X, --author X: last name of author of publications to search
-F, --first: Search only first authors
-Y X, --year X: years of publication to search (can be regular expression)
-K X, --keyword X: keywords to search
-J X, --journal X: publication journals to search
-D X, --doi X: search for a specific set of DOIs
-O, --open: open publication directory with found matches
-W, --webpage: open publication webpage with found matches
-E X, --export X: export all found matches to a single bibtex file
PROGRAM DEPENDENCIES:
read_referencerc.py: Sets default file path and file format for output files
language_conversion.py: Outputs map for converting symbols between languages
UPDATE HISTORY:
Updated 12/2020: using argparse to set command line options
Updated 07/2019: modifications for python3 string compatibility
Updated 06/2018: added DOI search using the -D or --doi options
Updated 11/2017: added export command to print matches to a single file
Updated 10/2017: use data path and data file format from referencerc file
Updated 07/2017: print number of matching articles in search query
Updated 06/2017: added webbrowser to open the webpage of found articles
Written 06/2017
"""
from __future__ import print_function
import sys
import re
import os
import inspect
import argparse
import posixpath
import subprocess
import webbrowser
from read_referencerc import read_referencerc
from language_conversion import language_conversion
#-- locate this program on disk
#-- filename: source file of the frame currently being executed
filename = inspect.getframeinfo(inspect.currentframe()).filename
#-- filepath: absolute directory that contains the file above
filepath = os.path.split(os.path.abspath(filename))[0]
#-- Reads bibtex files for each article stored in the working directory for
#-- keywords, authors, journal, etc
def search_references(AUTHOR, JOURNAL, YEAR, KEYWORDS, DOI, FIRST=False,
    OPEN=False, WEBPAGE=False, EXPORT=None):
    u"""
    Search the stored BibTeX files and print every entry matching the query

    The data path is read from the .referencerc file located next to this
    program. Every subdirectory of the data path whose name matches YEAR is
    scanned, then every subdirectory within it, and each file named like
    ``<text>-<text>.bib`` is parsed into its BibTeX fields.

    An entry is a match when ALL of the requested criteria hold:
        - AUTHOR matches the author field
        - JOURNAL matches the journal field
        - KEYWORDS matches the title field OR the keywords field
        - the doi field is one of DOI
    Criteria that are not requested (None or empty) always hold. An entry
    lacking a field needed by a requested criterion does not match on it.
    AUTHOR, JOURNAL and KEYWORDS searches are case-insensitive.

    Arguments:
        AUTHOR: list of regular expressions for author names (or None)
        JOURNAL: list of regular expressions for journal names (or None)
        YEAR: list of regular expressions matched against the beginning
            of the year directory names (or None for all numeric names)
        KEYWORDS: list of regular expressions for keywords (or None)
        DOI: collection of DOI strings compared exactly (or None)

    Keyword arguments:
        FIRST: author expressions must match at the start of the author field
        OPEN: call file_opener() on the BibTeX file of each match
        WEBPAGE: open the url (or https://doi.org/<doi>) of each match
        EXPORT: path of a file to write matching entries to
            (default: print matching entries to standard output)

    The number of matching and queried references is always printed to
    standard output.
    """
    #-- get reference filepath from referencerc file
    #-- (the reference file format is not needed for searching)
    datapath,_ = read_referencerc(os.path.join(filepath,'.referencerc'))
    #-- bibtex fields to be extracted from each entry
    bibtex_field_types = ['address','affiliation','annote','author',
        'booktitle','chapter','crossref','doi','edition','editor',
        'howpublished','institution','isbn','issn','journal','key',
        'keywords','month','note','number','organization','pages',
        'publisher','school','series','title','type','url','volume','year']
    #-- raw strings so that the regex escapes are not parsed as string escapes
    field_regex = r'[\s]?(' + '|'.join(bibtex_field_types) + \
        r')[\s]?\=[\s]?[\{]?[\{]?(.*?)[\}]?[\}]?[\,]?[\s]?\n'
    R1 = re.compile(field_regex, flags=re.IGNORECASE)
    #-- compile regular expression operators for input search terms
    R2 = R3 = R4 = None
    if AUTHOR:
        author_regex = '|'.join(AUTHOR)
        #-- group the alternatives so that the anchor applies to every
        #-- requested author and not only to the first one
        if FIRST:
            author_regex = '^(?:' + author_regex + ')'
        R2 = re.compile(author_regex, flags=re.IGNORECASE)
    if JOURNAL:
        R3 = re.compile('|'.join(JOURNAL), flags=re.IGNORECASE)
    if KEYWORDS:
        R4 = re.compile('|'.join(KEYWORDS), flags=re.IGNORECASE)
    #-- regular expressions for year directories and bibtex file names
    R_year = re.compile('|'.join(YEAR) if YEAR else r'\d+')
    R_bibtex = re.compile(r'(.*?)-(.*?)\.bib$')
    #-- map for replacing latex symbols with combining unicode characters
    #-- 1: latex, 2: combining unicode, 3: unicode, 4: plain
    conversions = [(LV,CV) for LV,CV,UV,PV in language_conversion()]

    #-- if exporting matches to a single file or standard output (to terminal)
    fid = open(os.path.expanduser(EXPORT), 'w') if EXPORT else sys.stdout
    match_count = 0
    query_count = 0
    try:
        #-- find directories of years
        years = [sd for sd in os.listdir(datapath) if R_year.match(sd) and
            os.path.isdir(os.path.join(datapath,sd))]
        for Y in sorted(years):
            #-- find author directories in year
            authors = [sd for sd in os.listdir(os.path.join(datapath,Y)) if
                os.path.isdir(os.path.join(datapath,Y,sd))]
            for A in sorted(authors):
                #-- find bibtex files
                directory = os.path.join(datapath,Y,A)
                bibtex_files = [fi for fi in os.listdir(directory)
                    if R_bibtex.match(fi)]
                #-- read each bibtex file
                for fi in bibtex_files:
                    bibtex_file = os.path.join(directory,fi)
                    with open(bibtex_file, 'r') as f:
                        bibtex_entry = f.read()
                    #-- extract bibtex fields
                    entry = {}
                    for key,val in R1.findall(bibtex_entry):
                        #-- python2: convert byte strings to unicode
                        if sys.version_info[0] == 2:
                            val = val.decode('unicode-escape')
                        #-- replace latex symbols with unicode characters
                        for LV,CV in conversions:
                            val = val.replace(LV,CV)
                        #-- add to current entry dictionary
                        entry[key.lower()] = val
                    #-- Search bibtex author entries for AUTHOR
                    F1 = True
                    if AUTHOR:
                        F1 = ('author' in entry) and \
                            R2.search(entry['author'])
                    #-- Search bibtex journal entries for JOURNAL
                    F2 = True
                    if JOURNAL:
                        F2 = ('journal' in entry) and \
                            R3.search(entry['journal'])
                    #-- Search bibtex title and keyword entries for KEYWORDS
                    #-- (a match in either of the two fields is sufficient)
                    F3 = True
                    if KEYWORDS:
                        F3 = any(R4.search(entry[k])
                            for k in ('title','keywords') if k in entry)
                    #-- Search bibtex DOI entries for a specific set of DOI's
                    F5 = True
                    if DOI:
                        F5 = ('doi' in entry) and (entry['doi'] in DOI)
                    #-- print the complete bibtex entry if search was found
                    if F1 and F2 and F3 and F5:
                        print(bibtex_entry, file=fid)
                        if OPEN:
                            file_opener(bibtex_file)
                        if WEBPAGE:
                            #-- URL to open (from url or using doi)
                            #-- reset for every match so that an entry
                            #-- without either field opens nothing
                            URL = None
                            if 'url' in entry:
                                URL = entry['url']
                            elif 'doi' in entry:
                                URL = posixpath.join('https://doi.org',
                                    entry['doi'])
                            #-- Open URL in a new tab, if browser window
                            #-- is already open
                            if URL:
                                webbrowser.open_new_tab(URL)
                        #-- add to total match count
                        match_count += 1
                    #-- add to total query count
                    query_count += 1
        #-- print the number of matching and number of queried references
        args = (match_count, query_count)
        print('Matching references = {0:d} out of {1:d} queried'.format(*args))
    finally:
        #-- close the exported bibtex file (even if the search failed)
        if EXPORT:
            fid.close()
#-- PURPOSE: hand a file to the opener of the current operating system
def file_opener(filename):
    u"""
    Open a file using the mechanism of the platform the program runs on

    Arguments:
        filename: path that is passed on to the platform opener

    The platform is chosen from sys.platform:
        win32: os.startfile with the "explore" operation
        darwin: the "open" command with the -R option
        anything else: the "xdg-open" command
    """
    platform = sys.platform
    #-- Windows has a dedicated call and needs no external command
    if platform == "win32":
        os.startfile(filename, "explore")
        return
    #-- all other platforms run an external command on the file
    if platform == "darwin":
        command = ["open", "-R", filename]
    else:
        command = ["xdg-open", filename]
    subprocess.call(command)
#-- main program that calls search_references()
def main():
    u"""
    Parse the command line options and run search_references() with them
    """
    #-- parser for the system arguments listed after the program
    #-- (the description literal is kept verbatim)
    cli = argparse.ArgumentParser(
        description="""Reads BibTeX files for each article in a given set of
years to search for keywords, authors, journal, etc using regular
expressions
"""
    )
    #-- settings shared by the options taking one or more search strings
    search_terms = dict(type=str, nargs='+')
    #-- settings shared by the on/off options
    switch = dict(default=False, action='store_true')
    #-- export file: expand the home directory and make the path absolute
    export_file = dict(type=lambda p: os.path.abspath(os.path.expanduser(p)))
    #-- command line parameters in the order they are registered
    #-- columns: long option, short option, settings, help text
    options = [
        ('--author','-A', search_terms,
            'Author of publications to search'),
        ('--first','-F', switch,
            'Search only lead authors'),
        ('--journal','-J', search_terms,
            'Publication journals to search'),
        ('--year','-Y', search_terms,
            'Years of publication to search'),
        ('--keyword','-K', search_terms,
            'Keywords to search'),
        ('--doi','-D', search_terms,
            'Search for specific Digital Object Identifiers (DOIs)'),
        ('--open','-O', switch,
            'Open publication directory with found matches'),
        ('--webpage','-W', switch,
            'Open publication webpage with found matches'),
        ('--export','-E', export_file,
            'Export found matches to a single BibTeX file'),
    ]
    for long_option, short_option, settings, help_text in options:
        cli.add_argument(long_option, short_option, help=help_text,
            **settings)
    selected = cli.parse_args()

    #-- search references for requested fields
    search_references(selected.author, selected.journal, selected.year,
        selected.keyword, selected.doi, FIRST=selected.first,
        OPEN=selected.open, WEBPAGE=selected.webpage,
        EXPORT=selected.export)

#-- run main program
if __name__ == '__main__':
    main()