forked from tsutterley/reference-toolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
copy_journal_articles.py
195 lines (174 loc) · 7.99 KB
/
copy_journal_articles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env python
u"""
copy_journal_articles.py (12/2020)
Copies journal articles and supplements from a website to a local directory
Given the lead author, journal name, publication year and volume, copies a pdf
file (or any other file if a supplement) to the reference path
CALLING SEQUENCE:
python copy_journal_articles.py --author Rignot --year 2008 \
--journal "Nature Geoscience" --volume 1 \
https://www.nature.com/ngeo/journal/v1/n2/pdf/ngeo102.pdf
will download the copy to 2008/Rignot/Rignot_Nat._Geosci.-1_2008.pdf
INPUTS:
url to file to be copied into the reference path
COMMAND LINE OPTIONS:
-A X, --author X: lead author of publication
-J X, --journal X: corresponding publication journal
-Y X, --year X: corresponding publication year
-V X, --volume X: corresponding publication volume
-N X, --number X: Corresponding publication number
-S, --supplement: file is a supplemental file
PROGRAM DEPENDENCIES:
read_referencerc.py: Sets default file path and file format for output files
language_conversion.py: Outputs map for converting symbols between languages
NOTES:
Lists of journal abbreviations
https://github.com/JabRef/abbrv.jabref.org/tree/master/journals
If using author name with unicode characters: put in quotes and check
unicode characters with http://www.fileformat.info/
UPDATE HISTORY:
Updated 12/2020: using argparse to set command line options
Updated 07/2019: modifications for python3 string compatibility
Updated 07/2018: using urllib.request for python3
Updated 10/2017: use data path and data file format from referencerc file
Updated 09/2017: use timeout of 20 to prevent socket.timeout
Updated 05/2017: Convert special characters with language_conversion program
Written 05/2017
"""
from __future__ import print_function
import sys
import os
import re
import ssl
import shutil
import inspect
import argparse
from read_referencerc import read_referencerc
from language_conversion import language_conversion
if sys.version_info[0] == 2:
import urllib2
else:
import urllib.request as urllib2
#-- locate this program on disk: full name of the running source file and
#-- the absolute directory that contains it
filename = inspect.getframeinfo(inspect.currentframe()).filename
filepath = os.path.split(os.path.abspath(filename))[0]
#-- PURPOSE: check internet connection and URL
def check_connection(remote_file):
    """Verify that a remote file can be opened

    Arguments:
        remote_file: url (or urllib Request) to attempt to open

    Returns:
        True if the url could be opened

    Raises:
        RuntimeError: 'Check URL: ...' if the server answered with an HTTP
            error, or 'Check internet connection' for any other URL error
    """
    #-- attempt to connect to remote file
    #-- NOTE(review): a bare ssl.SSLContext() does not appear to verify server
    #-- certificates; consider ssl.create_default_context() if that is not
    #-- intentional
    try:
        response = urllib2.urlopen(remote_file, timeout=20,
            context=ssl.SSLContext())
    except urllib2.HTTPError:
        #-- HTTPError must be tested first as it is a subclass of URLError
        raise RuntimeError('Check URL: {0}'.format(remote_file))
    except urllib2.URLError:
        raise RuntimeError('Check internet connection')
    else:
        #-- only reachability is tested: release the connection immediately
        response.close()
        return True
#-- PURPOSE: find the abbreviation of a journal within an abbreviation listing
def _find_journal_abbreviation(journal, abbreviation_contents):
    """Return the abbreviation listed for a journal or None if not listed

    Arguments:
        journal: full journal name (words separated by whitespace)
        abbreviation_contents: text with one "name = abbreviation" per line

    The name is matched case-insensitively against the start of a line.
    """
    #-- escape each word so that characters such as "." "(" or "+" within a
    #-- journal name are matched literally and not as regular expressions
    words = [re.escape(word) for word in journal.split()]
    if not words:
        return None
    #-- anchor on line boundaries (and not on newline characters) so that the
    #-- first and last lines of the listing can also be matched
    pattern = r'^{0}[ \t]*=[ \t]*(.*?)[ \t]*$'.format(r'\s+'.join(words))
    match = re.search(pattern, abbreviation_contents,
        flags=(re.IGNORECASE | re.MULTILINE))
    return match.group(1) if match else None

#-- PURPOSE: create directories and copy a reference file after formatting
def copy_journal_articles(remote,author,journal,year,volume,number,SUPPLEMENT):
    """Download a remote file into the reference directory tree

    Arguments:
        remote: url of the file to be copied
        author: lead author of the publication (used as a directory name)
        journal: full name of the publication journal
        year: publication year as a string (used as a directory name)
        volume: publication volume
        number: publication number
        SUPPLEMENT: file is a supplement and is placed in a "Supplemental"
            subdirectory of the author directory

    The output file is written to datapath/year/author[/Supplemental] with a
    name built from the format string in the .referencerc file.  A numerical
    suffix is added by create_unique_filename() if the name is already taken.
    """
    #-- get reference filepath and reference format from referencerc file
    datapath,dataformat=read_referencerc(os.path.join(filepath,'.referencerc'))
    #-- input remote file scrubbed of any additional html information
    fi = re.sub(r'\?[\_a-z]{1,4}\=(.*?)$','',remote)
    #-- get extension from file (assume pdf if extension cannot be extracted)
    fileExtension = os.path.splitext(fi)[1] or '.pdf'

    #-- file listing journal abbreviations modified from
    #-- https://github.com/JabRef/abbrv.jabref.org/tree/master/journals
    abbreviation_file = 'journal_abbreviations_webofscience-ts.txt'
    with open(os.path.join(filepath,abbreviation_file),'r') as f:
        abbreviation_contents = f.read()
    #-- if abbreviation not found: just use the whole journal name
    #-- else use the found journal abbreviation
    abbreviation = _find_journal_abbreviation(journal, abbreviation_contents)
    if abbreviation is None:
        print('Abbreviation for {0} not found'.format(journal))
        abbreviation = journal

    #-- replace unicode characters with combining unicode version
    if sys.version_info[0] == 2:
        author = author.decode('unicode-escape')
    #-- 1st column: latex, 2nd: combining unicode, 3rd: unicode, 4th: plain text
    for LV, CV, UV, PV in language_conversion():
        author = author.replace(UV, CV)

    #-- directory path for local file
    directory = os.path.join(datapath,year,author)
    if SUPPLEMENT:
        directory = os.path.join(directory,'Supplemental')
    #-- check if output directory currently exist and recursively create if not
    if not os.path.exists(directory):
        os.makedirs(directory)

    #-- format used for saving articles using string formatter
    #-- 0) Author Last Name
    #-- 1) Journal Name
    #-- 2) Journal Abbreviation
    #-- 3) Publication Volume
    #-- 4) Publication Number
    #-- 5) Publication Year
    #-- 6) File Extension (will include period)
    #-- initial test case for output file (will add numbers if not unique in fs)
    args = (author, journal.replace(' ','_'), abbreviation.replace(' ','_'),
        volume, number, year, fileExtension)
    local_file = os.path.join(directory, dataformat.format(*args))

    #-- chunked transfer encoding size
    CHUNK = 16 * 1024
    #-- open url and copy contents to local file using chunked transfer encoding
    #-- transfer should work properly with ascii and binary data formats
    headers = {'User-Agent':"Magic Browser"}
    request = urllib2.Request(remote, headers=headers)
    f_in = urllib2.urlopen(request, timeout=20, context=ssl.SSLContext())
    #-- close the remote connection even if the local copy fails
    try:
        with create_unique_filename(local_file) as f_out:
            shutil.copyfileobj(f_in, f_out, CHUNK)
    finally:
        f_in.close()
#-- PURPOSE: open a unique filename adding a numerical instance if existing
def create_unique_filename(filename):
    """Create and open a file that does not yet exist

    If filename is already taken, "-1", "-2", ... is inserted between the
    base name and the extension until an unused name is found.  The name
    that was opened is printed with the home directory shortened to "~".

    Arguments:
        filename: preferred full path of the output file

    Returns:
        file object opened in binary read/write mode ('wb+')

    Raises:
        OSError: if the file cannot be created for any reason other than
            already existing (e.g. missing directory or permission denied)
    """
    import errno
    #-- split filename into fileBasename and fileExtension
    fileBasename, fileExtension = os.path.splitext(filename)
    #-- O_EXCL makes the existence check and the creation a single operation
    #-- O_BINARY (only defined on Windows) prevents newline translation
    flags = os.O_CREAT | os.O_EXCL | os.O_RDWR | getattr(os, 'O_BINARY', 0)
    #-- create counter to add to the end of the filename if existing
    counter = 1
    while True:
        try:
            #-- open file descriptor only if the file doesn't exist
            fd = os.open(filename, flags)
        except OSError as exc:
            #-- only an existing file warrants trying the next name:
            #-- any other error would repeat forever and so is re-raised
            if exc.errno != errno.EEXIST:
                raise
        else:
            print(filename.replace(os.path.expanduser('~'),'~'))
            return os.fdopen(fd, 'wb+')
        #-- new filename adds counter the between fileBasename and fileExtension
        filename = u'{0}-{1:d}{2}'.format(fileBasename, counter, fileExtension)
        counter += 1
#-- main program that calls copy_journal_articles()
def main():
    """Parse the command line, check the url and download the article

    The url is a positional argument.  --author, --journal and --year are
    required as they are used to build the output directory and file name.
    --volume and --number default to empty strings and --supplement to False.
    """
    #-- Read the system arguments listed after the program
    parser = argparse.ArgumentParser(
        description="""Copies a journal article from a website to the reference
            local directory
            """
    )
    #-- command line parameters
    parser.add_argument('url',
        type=str, help='url to article to be copied into the reference path')
    #-- author, journal and year have no usable default: require them so that
    #-- a missing option is reported by argparse and not as a later traceback
    parser.add_argument('--author','-A',
        type=str, required=True, help='Lead author of publication')
    parser.add_argument('--journal','-J',
        type=str, required=True, help='Corresponding publication journal')
    parser.add_argument('--year','-Y',
        type=str, required=True, help='Corresponding publication year')
    parser.add_argument('--volume','-V',
        type=str, default='', help='Corresponding publication volume')
    parser.add_argument('--number','-N',
        type=str, default='', help='Corresponding publication number')
    parser.add_argument('--supplement','-S',
        default=False, action='store_true',
        help='File is an article supplement')
    args = parser.parse_args()

    #-- check connection to url and then download article
    if check_connection(args.url):
        copy_journal_articles(args.url, args.author, args.journal, args.year,
            args.volume, args.number, args.supplement)

#-- run main program
if __name__ == '__main__':
    main()