-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_pubmed.py
149 lines (122 loc) · 4.87 KB
/
test_pubmed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 1 13:24:45 2017
@author: shrey
"""
import time
from Bio import Entrez
import requests
from lxml import etree
from lxml import html
from utils import stringify_children
from nltk import tokenize
def get_links_id(pmid):
    """
    Count the PubMed articles that cite the given article.

    Queries Entrez ELink with linkname "pubmed_pubmed_citedin" and counts
    the entries of the first link set of the first returned record.

    pmid: PubMed identifier, passed to Entrez.elink as ``id``
    return: number of citing links, or 0 when the response carries no
        usable link set for the article
    """
    handle = Entrez.elink(dbfrom="pubmed", id=pmid, linkname="pubmed_pubmed_citedin")
    try:
        record = Entrez.read(handle)
    finally:
        # release the connection even when the response cannot be parsed
        handle.close()
    try:
        return len(record[0][u'LinkSetDb'][0][u'Link'])
    except (IndexError, KeyError, TypeError):
        # the response has no "cited in" link set: report zero citations
        # instead of failing (only lookup errors are swallowed here)
        return 0
def load_xml(pmid, sleep=None, timeout=30):
    """
    Fetch the PubMed record for the given pmid from the NCBI eutils efetch
    endpoint and parse the response body into an element tree.

    pmid: PubMed identifier; converted with str() before being sent
    sleep: seconds to wait after the request and before returning
        (None to skip the wait)
    timeout: seconds passed to requests.get as the request timeout, so a
        stalled connection cannot block the caller forever
    return: tree built by lxml.html.fromstring from the response content
    """
    page = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        # let requests build and escape the query string instead of
        # interpolating the pmid into the URL by hand
        params=[("db", "pubmed"), ("retmode", "xml"), ("id", str(pmid))],
        timeout=timeout,
    )
    tree = html.fromstring(page.content)
    if sleep is not None:
        time.sleep(sleep)
    return tree
def _element_text(node, strip=False):
    """Return node.text with None mapped to '', optionally stripped."""
    text = node.text or ''
    return text.strip() if strip else text


def _child_text(parent, tag):
    """Return the text of the first child named tag, or '' if child or text is missing."""
    child = parent.find(tag)
    if child is None:
        return ''
    return child.text or ''


def parse_pubmed_web_tree(tree):
    """
    Given a parsed PubMed record tree, return simple parsed information.

    The xpath expressions use lower-case tag names.
    NOTE(review): presumably because the tree is produced by
    lxml.html.fromstring, which lower-cases element names - confirm before
    passing a tree built by another parser.

    tree: element tree supporting .xpath() and element .find()
    return: dictionary with the keys title, abstract, journal, affiliation,
        authors, year, keywords, conclusion, copyright, pmid, doi and
        citation_index; fields missing from the tree come back as ''
        (citation_index as 0 when no pmid is found)
    """
    # xpath() returns a (possibly empty) list, never None, so joining the
    # matches directly covers the "nothing found" case with ''.
    title = ' '.join([_element_text(t) for t in tree.xpath('//articletitle')])

    abstract_tree = tree.xpath('//abstract/abstracttext')
    abstract = ' '.join([stringify_children(a).strip() for a in abstract_tree])

    # the conclusion is taken to be the last sentence of the abstract;
    # an empty abstract yields no sentences, so guard the lookup
    sentences = tokenize.sent_tokenize(abstract)
    conclusion = sentences[-1] if sentences else ''

    journal = ';'.join([_element_text(t, strip=True)
                        for t in tree.xpath('//article//title')])

    pubdate = tree.xpath('//pubmeddata//history//pubmedpubdate[@pubstatus="medline"]')
    year_node = pubdate[0].find('year') if len(pubdate) >= 1 else None
    year = year_node.text if year_node is not None else ''

    affiliations_text = '; '.join(
        [_element_text(affil) for affil in tree.xpath('//affiliationinfo/affiliation')])

    keywords_text = '; '.join(
        [_element_text(keyword) for keyword in tree.xpath('//keywordlist/keyword')])

    authors = []
    for author in tree.xpath('//authorlist/author'):
        # forename and lastname are looked up independently, so an author
        # carrying only one of them is still reported
        fullname = (_child_text(author, 'forename') + ' '
                    + _child_text(author, 'lastname')).strip()
        if fullname == '':
            # group authors have no personal name, only a collective name
            fullname = _child_text(author, 'collectivename')
        authors.append(fullname)
    authors_text = '; '.join(authors)

    copyright_text = ';'.join(
        [_element_text(c, strip=True)
         for c in tree.xpath('//abstract/copyrightinformation')])

    pmid = ';'.join(
        [_element_text(p, strip=True)
         for p in tree.xpath('//articleidlist/articleid[@idtype="pubmed"]')])

    doi = ';'.join(
        [_element_text(p, strip=True)
         for p in tree.xpath('//articleidlist/articleid[@idtype="doi"]')])

    # skip the remote citation lookup when the tree carries no pmid at all
    n_citations = get_links_id(pmid) if pmid else 0

    # title,abstract,journal,affiliation,authors,year,keywords,conclusion,copyright,pmid,doi,citationindex
    dict_out = {'title': title,
                'abstract': abstract,
                'journal': journal,
                'affiliation': affiliations_text,
                'authors': authors_text,
                'year': year,
                'keywords': keywords_text,
                'conclusion': conclusion,
                'copyright': copyright_text,
                'pmid': pmid,
                'doi': doi,
                'citation_index': n_citations}
    return dict_out
def parse_xml_web(pmid, sleep=None, save_xml=False):
    """
    Download the PubMed eutils record for pmid and parse it.

    pmid: PubMed identifier to fetch; stored in the result as str(pmid)
    sleep: forwarded to load_xml
    save_xml: when True, the serialized tree is added under the 'xml' key
    return: dictionary produced by parse_pubmed_web_tree, with 'pmid'
        overwritten by the requested pmid
    """
    fetched_tree = load_xml(pmid, sleep=sleep)
    parsed = parse_pubmed_web_tree(fetched_tree)
    # the requested pmid takes precedence over whatever the tree reported
    parsed['pmid'] = str(pmid)
    if not save_xml:
        return parsed
    parsed['xml'] = etree.tostring(fetched_tree)
    return parsed