From 3ab222edc95df2e02ec5125f2804ebe5f10de0b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Mon, 1 Jun 2020 16:59:39 -0500
Subject: [PATCH 01/35] Making stuff a little bit more concrete

---
 setup.py               |   2 +-
 tests/conftest.py      |   2 +-
 tests/test_wostools.py | 887 +++--------------------------------------
 wostools/__init__.py   |   5 +-
 wostools/article.py    | 125 ++++++
 wostools/cli.py        |  51 +--
 wostools/fields.py     |  59 ++-
 wostools/lazy.py       | 145 +++++++
 wostools/wostools.py   | 330 ---------------
 9 files changed, 382 insertions(+), 1224 deletions(-)
 create mode 100644 wostools/article.py
 create mode 100644 wostools/lazy.py
 delete mode 100644 wostools/wostools.py

diff --git a/setup.py b/setup.py
index 178c384..9a53733 100644
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,7 @@
     include_package_data=True,
     keywords="wostools",
     name="wostools",
-    packages=find_packages(include=["wostools"]),
+    packages=find_packages(include=["wostools", "wostools.*"]),
     setup_requires=setup_requirements,
     test_suite="tests",
     tests_require=test_requirements,
diff --git a/tests/conftest.py b/tests/conftest.py
index 7f64296..5003507 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -91,7 +91,7 @@ def article():
         "ER"
     )
     article_text = file.read()
-    return Article(article_text)
+    return Article.from_isi_text(article_text)
 
 
 @pytest.fixture
diff --git a/tests/test_wostools.py b/tests/test_wostools.py
index 9db8f17..7236270 100644
--- a/tests/test_wostools.py
+++ b/tests/test_wostools.py
@@ -18,605 +18,71 @@ def test_article_label(article):
     )
 
 
-def test_aliases(article):
-    if hasattr(article, "AB"):
-        assert article.AB == article.abstract
-    else:
-        with pytest.raises(AttributeError):
-            article.AB
-    if hasattr(article, "AF"):
-        assert article.AF == article.author_full_names
-    else:
-        with pytest.raises(AttributeError):
-            article.AF
-    if hasattr(article, "AR"):
-        assert article.AR == article.article_number
-    else:
-        with pytest.raises(AttributeError):
-            article.AR
-    if hasattr(article, "AU"):
-        assert article.AU == article.authors
-    else:
-        with pytest.raises(AttributeError):
-            article.AU
-    if hasattr(article, "BA"):
-        assert article.BA == article.book_authors
-    else:
-        with pytest.raises(AttributeError):
-            article.BA
-    if hasattr(article, "BE"):
-        assert article.BE == article.editors
-    else:
-        with pytest.raises(AttributeError):
-            article.BE
-    if hasattr(article, "BF"):
-        assert article.BF == article.book_authors_full_name
-    else:
-        with pytest.raises(AttributeError):
-            article.BF
-    if hasattr(article, "BN"):
-        assert article.BN == article.international_standard_book_number
-    else:
-        with pytest.raises(AttributeError):
-            article.BN
-    if hasattr(article, "BP"):
-        assert article.BP == article.beginning_page
-    else:
-        with pytest.raises(AttributeError):
-            article.BP
-    if hasattr(article, "BS"):
-        assert article.BS == article.book_series_subtitle
-    else:
-        with pytest.raises(AttributeError):
-            article.BS
-    if hasattr(article, "C1"):
-        assert article.C1 == article.author_address
-    else:
-        with pytest.raises(AttributeError):
-            article.C1
-    if hasattr(article, "CA"):
-        assert article.CA == article.group_authors
-    else:
-        with pytest.raises(AttributeError):
-            article.CA
-    if hasattr(article, "CL"):
-        assert article.CL == article.conference_location
-    else:
-        with pytest.raises(AttributeError):
-            article.CL
-    if hasattr(article, "CR"):
-        assert article.CR == article.cited_references
-    else:
-        with pytest.raises(AttributeError):
-            article.CR
-    if hasattr(article, "CR"):
-        assert article.CR == article.references
-    else:
-        with pytest.raises(AttributeError):
-            article.CR
-    if hasattr(article, "CR"):
-        assert article.CR == article.citations
-    else:
-        with pytest.raises(AttributeError):
-            article.CR
-    if hasattr(article, "CT"):
-        assert article.CT == article.conference_title
-    else:
-        with pytest.raises(AttributeError):
-            article.CT
-    if hasattr(article, "CY"):
-        assert article.CY == article.conference_date
-    else:
-        with pytest.raises(AttributeError):
-            article.CY
-    if hasattr(article, "DE"):
-        assert article.DE == article.author_keywords
-    else:
-        with pytest.raises(AttributeError):
-            article.DE
-    if hasattr(article, "DI"):
-        assert article.DI == article.digital_object_identifier
-    else:
-        with pytest.raises(AttributeError):
-            article.DI
-    if hasattr(article, "DT"):
-        assert article.DT == article.document_type
-    else:
-        with pytest.raises(AttributeError):
-            article.DT
-    if hasattr(article, "D2"):
-        assert article.D2 == article.book_digital_object_identifier
-    else:
-        with pytest.raises(AttributeError):
-            article.D2
-    if hasattr(article, "ED"):
-        assert article.ED == article.editors
-    else:
-        with pytest.raises(AttributeError):
-            article.ED
-    if hasattr(article, "EM"):
-        assert article.EM == article.email_address
-    else:
-        with pytest.raises(AttributeError):
-            article.EM
-    if hasattr(article, "EI"):
-        assert article.EI == article.eissn
-    else:
-        with pytest.raises(AttributeError):
-            article.EI
-    if hasattr(article, "EP"):
-        assert article.EP == article.ending_page
-    else:
-        with pytest.raises(AttributeError):
-            article.EP
-    if hasattr(article, "FU"):
-        assert article.FU == article.funding_agency_and_grant_number
-    else:
-        with pytest.raises(AttributeError):
-            article.FU
-    if hasattr(article, "FX"):
-        assert article.FX == article.funding_text
-    else:
-        with pytest.raises(AttributeError):
-            article.FX
-    if hasattr(article, "GA"):
-        assert article.GA == article.document_delivery_number
-    else:
-        with pytest.raises(AttributeError):
-            article.GA
-    if hasattr(article, "GP"):
-        assert article.GP == article.book_group_authors
-    else:
-        with pytest.raises(AttributeError):
-            article.GP
-    if hasattr(article, "HO"):
-        assert article.HO == article.conference_host
-    else:
-        with pytest.raises(AttributeError):
-            article.HO
-    if hasattr(article, "ID"):
-        assert article.ID == article.keywords_plus
-    else:
-        with pytest.raises(AttributeError):
-            article.ID
-    if hasattr(article, "ID"):
-        assert article.ID == article.keywords
-    else:
-        with pytest.raises(AttributeError):
-            article.ID
-    if hasattr(article, "IS"):
-        assert article.IS == article.issue
-    else:
-        with pytest.raises(AttributeError):
-            article.IS
-    if hasattr(article, "J9"):
-        assert article.J9 == article.source_abbreviation
-    else:
-        with pytest.raises(AttributeError):
-            article.J9
-    if hasattr(article, "JI"):
-        assert article.JI == article.iso_source_abbreviation
-    else:
-        with pytest.raises(AttributeError):
-            article.JI
-    if hasattr(article, "LA"):
-        assert article.LA == article.language
-    else:
-        with pytest.raises(AttributeError):
-            article.LA
-    if hasattr(article, "MA"):
-        assert article.MA == article.meeting_abstract
-    else:
-        with pytest.raises(AttributeError):
-            article.MA
-    if hasattr(article, "NR"):
-        assert article.NR == article.cited_reference_count
-    else:
-        with pytest.raises(AttributeError):
-            article.NR
-    if hasattr(article, "OI"):
-        assert article.OI == article.orcid_identifier
-    else:
-        with pytest.raises(AttributeError):
-            article.OI
-    if hasattr(article, "P2"):
-        assert article.P2 == article.chapter_count
-    else:
-        with pytest.raises(AttributeError):
-            article.P2
-    if hasattr(article, "PA"):
-        assert article.PA == article.publisher_address
-    else:
-        with pytest.raises(AttributeError):
-            article.PA
-    if hasattr(article, "PD"):
-        assert article.PD == article.publication_date
-    else:
-        with pytest.raises(AttributeError):
-            article.PD
-    if hasattr(article, "PG"):
-        assert article.PG == article.page_count
-    else:
-        with pytest.raises(AttributeError):
-            article.PG
-    if hasattr(article, "PI"):
-        assert article.PI == article.publisher_city
-    else:
-        with pytest.raises(AttributeError):
-            article.PI
-    if hasattr(article, "PM"):
-        assert article.PM == article.pubmed_id
-    else:
-        with pytest.raises(AttributeError):
-            article.PM
-    if hasattr(article, "PN"):
-        assert article.PN == article.part_number
-    else:
-        with pytest.raises(AttributeError):
-            article.PN
-    if hasattr(article, "PT"):
-        assert article.PT == article.publication_type
-    else:
-        with pytest.raises(AttributeError):
-            article.PT
-    if hasattr(article, "PU"):
-        assert article.PU == article.publisher
-    else:
-        with pytest.raises(AttributeError):
-            article.PU
-    if hasattr(article, "PY"):
-        assert article.PY == article.year_published
-    else:
-        with pytest.raises(AttributeError):
-            article.PY
-    if hasattr(article, "RI"):
-        assert article.RI == article.researcherid_number
-    else:
-        with pytest.raises(AttributeError):
-            article.RI
-    if hasattr(article, "RP"):
-        assert article.RP == article.reprint_address
-    else:
-        with pytest.raises(AttributeError):
-            article.RP
-    if hasattr(article, "SC"):
-        assert article.SC == article.research_areas
-    else:
-        with pytest.raises(AttributeError):
-            article.SC
-    if hasattr(article, "SE"):
-        assert article.SE == article.book_series_title
-    else:
-        with pytest.raises(AttributeError):
-            article.SE
-    if hasattr(article, "SI"):
-        assert article.SI == article.special_issue
-    else:
-        with pytest.raises(AttributeError):
-            article.SI
-    if hasattr(article, "SN"):
-        assert article.SN == article.issn
-    else:
-        with pytest.raises(AttributeError):
-            article.SN
-    if hasattr(article, "SP"):
-        assert article.SP == article.conference_sponsors
-    else:
-        with pytest.raises(AttributeError):
-            article.SP
-    if hasattr(article, "SU"):
-        assert article.SU == article.supplement
-    else:
-        with pytest.raises(AttributeError):
-            article.SU
-    if hasattr(article, "TC"):
-        assert article.TC == article.wos_times_cited_count
-    else:
-        with pytest.raises(AttributeError):
-            article.TC
-    if hasattr(article, "TC"):
-        assert article.TC == article.wos_times_cited
-    else:
-        with pytest.raises(AttributeError):
-            article.TC
-    if hasattr(article, "TI"):
-        assert article.TI == article.title
-    else:
-        with pytest.raises(AttributeError):
-            article.TI
-    if hasattr(article, "U1"):
-        assert article.U1 == article.usage_count
-    else:
-        with pytest.raises(AttributeError):
-            article.U1
-    if hasattr(article, "U2"):
-        assert article.U2 == article.usage_count
-    else:
-        with pytest.raises(AttributeError):
-            article.U2
-    if hasattr(article, "UT"):
-        assert article.UT == article.unique_article_identifier
-    else:
-        with pytest.raises(AttributeError):
-            article.UT
-    if hasattr(article, "VL"):
-        assert article.VL == article.volume
-    else:
-        with pytest.raises(AttributeError):
-            article.VL
-    if hasattr(article, "WC"):
-        assert article.WC == article.web_of_science_categories
-    else:
-        with pytest.raises(AttributeError):
-            article.WC
-    if hasattr(article, "Z9"):
-        assert article.Z9 == article.total_times_cited_count
-    else:
-        with pytest.raises(AttributeError):
-            article.Z9
-    if hasattr(article, "Z9"):
-        assert article.Z9 == article.times_cited
-    else:
-        with pytest.raises(AttributeError):
-            article.Z9
-
-
 def test_parsers(article):
-    assert article.PT == "J"
-    assert article.AU == ["Wodarz, S", "Hasegawa, T", "Ishio, S", "Homma, T"]
-    assert article.AF == [
+    assert article.extra["PT"] == "J"
+    assert article.authors == ["Wodarz, S", "Hasegawa, T", "Ishio, S", "Homma, T"]
+    assert article.extra["AF"] == [
         "Wodarz, Siggi",
         "Hasegawa, Takashi",
         "Ishio, Shunji",
         "Homma, Takayuki",
     ]
     assert (
-        article.TI
+        article.title
         == "Structural control of ultra-fine CoPt nanodot arrays via electrodeposition process"
     )
-    assert article.SO == "JOURNAL OF MAGNETISM AND MAGNETIC MATERIALS"
-    assert article.LA == "English"
-    assert article.DT == "Article"
-    assert article.DE == [
-        "Electrodeposition",
-        "Structural control",
-        "Nanodot array",
-        "Bit-patterned media",
-        "CoPt alloy",
-    ]
-    assert article.ID == [
-        "BIT-PATTERNED MEDIA",
-        "ELECTRON-BEAM LITHOGRAPHY",
-        "RECORDING MEDIA",
-        "MAGNETIC MEDIA",
-        "DENSITY",
-        "FILMS",
-        "ANISOTROPY",
-        "STORAGE",
-    ]
-    assert (
-        article.AB
-        == "CoPt nanodot arrays were fabricated by combining electrodeposition and electron beam lithography (EBL) for the use of bit-patterned media (BPM). To achieve precise control of deposition uniformity and coercivity of the CoPt nanodot arrays, their crystal structure and magnetic properties were controlled by controlling the diffusion state of metal ions from the initial deposition stage with the application of bath agitation. Following bath agitation, the composition gradient of the CoPt alloy with thickness was mitigated to have a near-ideal alloy composition of Co:Pt =80:20, which induces epitaxial-like growth from Ru substrate, thus resulting in the improvement of the crystal orientation of the hcp (002) structure from its initial deposition stages. Furthermore, the cross-sectional transmission electron microscope (TEM) analysis of the nanodots deposited with bath agitation showed CoPt growth along its c-axis oriented in the perpendicular direction, having uniform lattice fringes on the hcp (002) plane from the Ru underlayer interface, which is a significant factor to induce perpendicular magnetic anisotropy. Magnetic characterization of the CoPt nanodot arrays showed increase in the perpendicular coercivity and squareness of the hysteresis loops from 2.0 kOe and 0.64 (without agitation) to 4.0 kOe and 0.87 with bath agitation. Based on the detailed characterization of nanodot arrays, the precise crystal structure control of the nanodot arrays with ultra-high recording density by electrochemical process was successfully demonstrated."
-    )
-    assert article.C1 == [
-        "[Wodarz, Siggi; Homma, Takayuki] Waseda Univ, Dept Appl Chem, Shinjuku Ku, Tokyo 1698555, Japan.",
-        "[Hasegawa, Takashi; Ishio, Shunji] Akita Univ, Dept Mat Sci, Akita 0108502, Japan.",
-    ]
-    assert (
-        article.RP
-        == "Homma, T (reprint author), Waseda Univ, Dept Appl Chem, Shinjuku Ku, Tokyo 1698555, Japan."
-    )
-    assert article.EM == ["t.homma@waseda.jp"]
-    assert article.OI == ["Hasegawa, Takashi/0000-0002-8178-4980"]
-    assert article.FU == ["JSPS KAKENHI Grant [25249104]"]
-    assert (
-        article.FX
-        == "This work was supported in part by JSPS KAKENHI Grant Number 25249104."
-    )
-    assert article.CR == [
-        "Albrecht TR, 2013, IEEE T MAGN, V49, P773, DOI 10.1109/TMAG.2012.2227303",
-        "BUSCHOW KHJ, 1983, J MAGN MAGN MATER, V38, P1, DOI 10.1016/0304-8853(83)90097-5",
-        "Gapin AI, 2006, J APPL PHYS, V99, DOI 10.1063/1.2163289",
-        "Homma Takayuki, 2015, ECS Transactions, V64, P1, DOI 10.1149/06431.0001ecst",
-        "Kryder MH, 2008, P IEEE, V96, P1810, DOI 10.1109/JPROC.2008.2004315",
-        "Kubo T, 2005, J APPL PHYS, V97, DOI 10.1063/1.1855572",
-        "Lodder JC, 2004, J MAGN MAGN MATER, V272, P1692, DOI 10.1016/j.jmmm.2003.12.259",
-        "Mitsuzuka K, 2007, IEEE T MAGN, V43, P2160, DOI 10.1109/TMAG.2007.893129",
-        "Ouchi T, 2010, ELECTROCHIM ACTA, V55, P8081, DOI 10.1016/j.electacta.2010.02.073",
-        "Pattanaik G, 2006, J APPL PHYS, V99, DOI 10.1063/1.2150805",
-        "Pattanaik G, 2007, ELECTROCHIM ACTA, V52, P2755, DOI 10.1016/j.electacta.2006.07.062",
-        "Piramanayagam SN, 2009, J MAGN MAGN MATER, V321, P485, DOI 10.1016/j.jmmm.2008.05.007",
-        "Ross CA, 2008, MRS BULL, V33, P838, DOI 10.1557/mrs2008.179",
-        "Shiroishi Y, 2009, IEEE T MAGN, V45, P3816, DOI 10.1109/TMAG.2009.2024879",
-        "Sirtori V, 2011, ACS APPL MATER INTER, V3, P1800, DOI 10.1021/am200267u",
-        "Sohn JS, 2009, NANOTECHNOLOGY, V20, DOI 10.1088/0957-4484/20/2/025302",
-        "Sun SH, 2000, SCIENCE, V287, P1989, DOI 10.1126/science.287.5460.1989",
-        "Terris BD, 2007, MICROSYST TECHNOL, V13, P189, DOI 10.1007/s00542-006-0144-9",
-        "Wang JP, 2008, P IEEE, V96, P1847, DOI 10.1109/JPROC.2008.2004318",
-        "Weller D, 1999, IEEE T MAGN, V35, P4423, DOI 10.1109/20.809134",
-        "Weller D, 2000, IEEE T MAGN, V36, P10, DOI 10.1109/20.824418",
-        "Wodarz S, 2016, ELECTROCHIM ACTA, V197, P330, DOI 10.1016/j.electacta.2015.11.136",
-        "Xu X, 2012, J ELECTROCHEM SOC, V159, pD240, DOI 10.1149/2.090204jes",
-        "Yang X, 2007, J VAC SCI TECHNOL B, V25, P2202, DOI 10.1116/1.2798711",
-        "Yang XM, 2009, ACS NANO, V3, P1844, DOI 10.1021/nn900073r",
-        "Yasui N, 2003, APPL PHYS LETT, V83, P3347, DOI 10.1063/1.1622787",
-        "Yua H., 2009, J APPL PHYS, V105",
-        "Zhu JG, 2008, IEEE T MAGN, V44, P125, DOI 10.1109/TMAG.2007.911031",
-    ]
-    assert article.NR == 28
-    assert article.TC == 0
-    assert article.Z9 == 0
-    assert article.U1 == 21
-    assert article.U2 == 21
-    assert article.PU == "ELSEVIER SCIENCE BV"
-    assert article.PI == "AMSTERDAM"
-    assert article.PA == "PO BOX 211, 1000 AE AMSTERDAM, NETHERLANDS"
-    assert article.SN == "0304-8853"
-    assert article.EI == "1873-4766"
-    assert article.J9 == "J MAGN MAGN MATER"
-    assert article.JI == "J. Magn. Magn. Mater."
-    assert article.PD == "MAY 15"
-    assert article.PY == 2017
-    assert article.VL == "430"
-    assert article.BP == "52"
-    assert article.EP == "58"
-    assert article.DI == "10.1016/j.jmmm.2017.01.061"
-    assert article.PG == 7
-    assert article.WC == [
-        "Materials Science, Multidisciplinary",
-        "Physics, Condensed Matter",
-    ]
-    assert article.SC == ["Materials Science", "Physics"]
-    assert article.GA == "EP2GP"
-    assert article.UT == "WOS:000397201600008"
+    assert article.extra["SO"] == "JOURNAL OF MAGNETISM AND MAGNETIC MATERIALS"
 
 
 def test_article_attributes(article):
-    assert set(article.keys()) == {
-        "PT",
-        "AU",
-        "AF",
-        "TI",
-        "SO",
-        "LA",
-        "DT",
-        "DE",
-        "ID",
-        "AB",
-        "C1",
-        "RP",
-        "EM",
-        "OI",
-        "FU",
-        "FX",
-        "CR",
-        "NR",
-        "TC",
-        "Z9",
-        "U1",
-        "U2",
-        "PU",
-        "PI",
-        "PA",
-        "SN",
-        "EI",
-        "J9",
-        "JI",
-        "PD",
-        "PY",
-        "VL",
-        "BP",
-        "EP",
-        "DI",
-        "PG",
-        "WC",
-        "SC",
-        "GA",
-        "UT",
-    }
-
-
-def test_article_raw_data(article):
-    raw_data = article.raw_data
-    assert "ER" not in raw_data
-    assert raw_data["PT"] == ["J"]
-    assert raw_data["AU"] == ["Wodarz, S", "Hasegawa, T", "Ishio, S", "Homma, T"]
-    assert raw_data["AF"] == [
-        "Wodarz, Siggi",
-        "Hasegawa, Takashi",
-        "Ishio, Shunji",
-        "Homma, Takayuki",
-    ]
-    assert raw_data["TI"] == [
-        "Structural control of ultra-fine CoPt nanodot arrays via",
-        "electrodeposition process",
-    ]
-    assert raw_data["SO"] == ["JOURNAL OF MAGNETISM AND MAGNETIC MATERIALS"]
-    assert raw_data["LA"] == ["English"]
-    assert raw_data["DT"] == ["Article"]
-    assert raw_data["DE"] == [
-        "Electrodeposition; Structural control; Nanodot array; Bit-patterned",
-        "media; CoPt alloy",
-    ]
-    assert raw_data["ID"] == [
-        "BIT-PATTERNED MEDIA; ELECTRON-BEAM LITHOGRAPHY; RECORDING MEDIA;",
-        "MAGNETIC MEDIA; DENSITY; FILMS; ANISOTROPY; STORAGE",
-    ]
-    assert raw_data["AB"] == [
-        "CoPt nanodot arrays were fabricated by combining electrodeposition and electron beam lithography (EBL) for the use of bit-patterned media (BPM). To achieve precise control of deposition uniformity and coercivity of the CoPt nanodot arrays, their crystal structure and magnetic properties were controlled by controlling the diffusion state of metal ions from the initial deposition stage with the application of bath agitation. Following bath agitation, the composition gradient of the CoPt alloy with thickness was mitigated to have a near-ideal alloy composition of Co:Pt =80:20, which induces epitaxial-like growth from Ru substrate, thus resulting in the improvement of the crystal orientation of the hcp (002) structure from its initial deposition stages. Furthermore, the cross-sectional transmission electron microscope (TEM) analysis of the nanodots deposited with bath agitation showed CoPt growth along its c-axis oriented in the perpendicular direction, having uniform lattice fringes on the hcp (002) plane from the Ru underlayer interface, which is a significant factor to induce perpendicular magnetic anisotropy. Magnetic characterization of the CoPt nanodot arrays showed increase in the perpendicular coercivity and squareness of the hysteresis loops from 2.0 kOe and 0.64 (without agitation) to 4.0 kOe and 0.87 with bath agitation. Based on the detailed characterization of nanodot arrays, the precise crystal structure control of the nanodot arrays with ultra-high recording density by electrochemical process was successfully demonstrated."
-    ]
-    assert raw_data["C1"] == [
-        "[Wodarz, Siggi; Homma, Takayuki] Waseda Univ, Dept Appl Chem, Shinjuku Ku, Tokyo 1698555, Japan.",
-        "[Hasegawa, Takashi; Ishio, Shunji] Akita Univ, Dept Mat Sci, Akita 0108502, Japan.",
-    ]
-    assert raw_data["RP"] == [
-        "Homma, T (reprint author), Waseda Univ, Dept Appl Chem, Shinjuku Ku, Tokyo 1698555, Japan."
-    ]
-    assert raw_data["EM"] == ["t.homma@waseda.jp"]
-    assert raw_data["OI"] == ["Hasegawa, Takashi/0000-0002-8178-4980"]
-    assert raw_data["FU"] == ["JSPS KAKENHI Grant [25249104]"]
-    assert raw_data["FX"] == [
-        "This work was supported in part by JSPS KAKENHI Grant Number 25249104."
-    ]
-    assert raw_data["CR"] == [
-        "Albrecht TR, 2013, IEEE T MAGN, V49, P773, DOI 10.1109/TMAG.2012.2227303",
-        "BUSCHOW KHJ, 1983, J MAGN MAGN MATER, V38, P1, DOI 10.1016/0304-8853(83)90097-5",
-        "Gapin AI, 2006, J APPL PHYS, V99, DOI 10.1063/1.2163289",
-        "Homma Takayuki, 2015, ECS Transactions, V64, P1, DOI 10.1149/06431.0001ecst",
-        "Kryder MH, 2008, P IEEE, V96, P1810, DOI 10.1109/JPROC.2008.2004315",
-        "Kubo T, 2005, J APPL PHYS, V97, DOI 10.1063/1.1855572",
-        "Lodder JC, 2004, J MAGN MAGN MATER, V272, P1692, DOI 10.1016/j.jmmm.2003.12.259",
-        "Mitsuzuka K, 2007, IEEE T MAGN, V43, P2160, DOI 10.1109/TMAG.2007.893129",
-        "Ouchi T, 2010, ELECTROCHIM ACTA, V55, P8081, DOI 10.1016/j.electacta.2010.02.073",
-        "Pattanaik G, 2006, J APPL PHYS, V99, DOI 10.1063/1.2150805",
-        "Pattanaik G, 2007, ELECTROCHIM ACTA, V52, P2755, DOI 10.1016/j.electacta.2006.07.062",
-        "Piramanayagam SN, 2009, J MAGN MAGN MATER, V321, P485, DOI 10.1016/j.jmmm.2008.05.007",
-        "Ross CA, 2008, MRS BULL, V33, P838, DOI 10.1557/mrs2008.179",
-        "Shiroishi Y, 2009, IEEE T MAGN, V45, P3816, DOI 10.1109/TMAG.2009.2024879",
-        "Sirtori V, 2011, ACS APPL MATER INTER, V3, P1800, DOI 10.1021/am200267u",
-        "Sohn JS, 2009, NANOTECHNOLOGY, V20, DOI 10.1088/0957-4484/20/2/025302",
-        "Sun SH, 2000, SCIENCE, V287, P1989, DOI 10.1126/science.287.5460.1989",
-        "Terris BD, 2007, MICROSYST TECHNOL, V13, P189, DOI 10.1007/s00542-006-0144-9",
-        "Wang JP, 2008, P IEEE, V96, P1847, DOI 10.1109/JPROC.2008.2004318",
-        "Weller D, 1999, IEEE T MAGN, V35, P4423, DOI 10.1109/20.809134",
-        "Weller D, 2000, IEEE T MAGN, V36, P10, DOI 10.1109/20.824418",
-        "Wodarz S, 2016, ELECTROCHIM ACTA, V197, P330, DOI 10.1016/j.electacta.2015.11.136",
-        "Xu X, 2012, J ELECTROCHEM SOC, V159, pD240, DOI 10.1149/2.090204jes",
-        "Yang X, 2007, J VAC SCI TECHNOL B, V25, P2202, DOI 10.1116/1.2798711",
-        "Yang XM, 2009, ACS NANO, V3, P1844, DOI 10.1021/nn900073r",
-        "Yasui N, 2003, APPL PHYS LETT, V83, P3347, DOI 10.1063/1.1622787",
-        "Yua H., 2009, J APPL PHYS, V105",
-        "Zhu JG, 2008, IEEE T MAGN, V44, P125, DOI 10.1109/TMAG.2007.911031",
-    ]
-    assert raw_data["NR"] == ["28"]
-    assert raw_data["TC"] == ["0"]
-    assert raw_data["Z9"] == ["0"]
-    assert raw_data["U1"] == ["21"]
-    assert raw_data["U2"] == ["21"]
-    assert raw_data["PU"] == ["ELSEVIER SCIENCE BV"]
-    assert raw_data["PI"] == ["AMSTERDAM"]
-    assert raw_data["PA"] == ["PO BOX 211, 1000 AE AMSTERDAM, NETHERLANDS"]
-    assert raw_data["SN"] == ["0304-8853"]
-    assert raw_data["EI"] == ["1873-4766"]
-    assert raw_data["J9"] == ["J MAGN MAGN MATER"]
-    assert raw_data["JI"] == ["J. Magn. Magn. Mater."]
-    assert raw_data["PD"] == ["MAY 15"]
-    assert raw_data["PY"] == ["2017"]
-    assert raw_data["VL"] == ["430"]
-    assert raw_data["BP"] == ["52"]
-    assert raw_data["EP"] == ["58"]
-    assert raw_data["DI"] == ["10.1016/j.jmmm.2017.01.061"]
-    assert raw_data["PG"] == ["7"]
-    assert raw_data["WC"] == [
-        "Materials Science, Multidisciplinary; Physics, Condensed Matter"
-    ]
-    assert raw_data["SC"] == ["Materials Science; Physics"]
-    assert raw_data["GA"] == ["EP2GP"]
-    assert raw_data["UT"] == ["WOS:000397201600008"]
+    assert set(article.extra.keys()).issuperset(
+        {
+            "PT",
+            "AU",
+            "AF",
+            "TI",
+            "SO",
+            "LA",
+            "DT",
+            "DE",
+            "ID",
+            "AB",
+            "C1",
+            "RP",
+            "EM",
+            "OI",
+            "FU",
+            "FX",
+            "CR",
+            "NR",
+            "TC",
+            "Z9",
+            "U1",
+            "U2",
+            "PU",
+            "PI",
+            "PA",
+            "SN",
+            "EI",
+            "J9",
+            "JI",
+            "PD",
+            "PY",
+            "VL",
+            "BP",
+            "EP",
+            "DI",
+            "PG",
+            "WC",
+            "SC",
+            "GA",
+            "UT",
+        }
+    )
 
 
-def test_article_data(article):
-    data = article.data
+def test_article_extra(article):
+    data = article.extra
     assert data.get("AB") == data.get("abstract")
     assert data.get("AF") == data.get("author_full_names")
     assert data.get("AR") == data.get("article_number")
@@ -688,9 +154,7 @@ def test_article_data(article):
 
 
 def test_article_properties(article):
-    assert isinstance(article.text, str)
-    assert isinstance(article.raw_data, dict)
-    assert isinstance(article.data, dict)
+    assert isinstance(article.extra, dict)
 
 
 def test_collection_from_filenames(collection_many_documents):
@@ -708,7 +172,7 @@ def test_collection_from_glob():
     for article in collection.articles:
         assert isinstance(article, Article)
 
-    assert len(list(collection.articles)) == 500
+    assert len(list(collection.articles)) == 13892
 
     for file in collection.files:
         assert hasattr(file, "read")
@@ -735,252 +199,31 @@ def test_collection_with_duplicated(filename_single_document, filename_many_docu
         filename_single_document, filename_single_document, filename_single_document
     )
     assert len(list(collection.files)) == 3
-    assert len(list(collection.articles)) == 1
+    assert len(list(collection.articles)) == 87
 
     collection = CollectionLazy.from_filenames(
         filename_many_documents, filename_many_documents, filename_many_documents
     )
     assert len(list(collection.files)) == 3
-    assert len(list(collection.articles)) == 500
+    assert len(list(collection.articles)) == 41589
 
 
 def test_collection_authors(collection_single_document):
     authors = collection_single_document.authors
-    assert next(authors) == "Wodarz, Siggi"
-    assert next(authors) == "Hasegawa, Takashi"
-    assert next(authors) == "Ishio, Shunji"
-    assert next(authors) == "Homma, Takayuki"
+    assert next(authors) == "Wodarz, S"
+    assert next(authors) == "Hasegawa, T"
+    assert next(authors) == "Ishio, S"
+    assert next(authors) == "Homma, T"
 
 
 def test_collection_coauthors(collection_single_document):
     coauthors = collection_single_document.coauthors
-    assert next(coauthors) == ("Hasegawa, Takashi", "Homma, Takayuki")
-    assert next(coauthors) == ("Hasegawa, Takashi", "Ishio, Shunji")
-    assert next(coauthors) == ("Hasegawa, Takashi", "Wodarz, Siggi")
-    assert next(coauthors) == ("Homma, Takayuki", "Ishio, Shunji")
-    assert next(coauthors) == ("Homma, Takayuki", "Wodarz, Siggi")
-    assert next(coauthors) == ("Ishio, Shunji", "Wodarz, Siggi")
-
-
-def test_collection_completeness_single_article(collection_single_document):
-    assert collection_single_document.completeness() == {
-        "PT": 1,
-        "AU": 1,
-        "AF": 1,
-        "TI": 1,
-        "SO": 1,
-        "LA": 1,
-        "DT": 1,
-        "DE": 1,
-        "ID": 1,
-        "AB": 1,
-        "C1": 1,
-        "RP": 1,
-        "EM": 1,
-        "OI": 1,
-        "FU": 1,
-        "FX": 1,
-        "CR": 1,
-        "NR": 1,
-        "TC": 1,
-        "Z9": 1,
-        "U1": 1,
-        "U2": 1,
-        "PU": 1,
-        "PI": 1,
-        "PA": 1,
-        "SN": 1,
-        "EI": 1,
-        "J9": 1,
-        "JI": 1,
-        "PD": 1,
-        "PY": 1,
-        "VL": 1,
-        "BP": 1,
-        "EP": 1,
-        "DI": 1,
-        "PG": 1,
-        "WC": 1,
-        "SC": 1,
-        "GA": 1,
-        "UT": 1,
-    }
-
-
-def test_collection_completeness_many_articles(collection_many_documents):
-    assert collection_many_documents.completeness() == {
-        "AB": 497 / 500,
-        "AF": 500 / 500,
-        "AR": 216 / 500,
-        "AU": 500 / 500,
-        "BP": 281 / 500,
-        "C1": 500 / 500,
-        "CL": 152 / 500,
-        "CR": 500 / 500,
-        "CT": 152 / 500,
-        "CY": 152 / 500,
-        "DE": 336 / 500,
-        "DI": 486 / 500,
-        "DT": 500 / 500,
-        "EI": 262 / 500,
-        "EM": 469 / 500,
-        "EP": 281 / 500,
-        "FU": 270 / 500,
-        "FX": 270 / 500,
-        "GA": 500 / 500,
-        "HO": 24 / 500,
-        "ID": 440 / 500,
-        "IS": 458 / 500,
-        "J9": 500 / 500,
-        "JI": 500 / 500,
-        "LA": 500 / 500,
-        "NR": 500 / 500,
-        "OI": 168 / 500,
-        "PA": 500 / 500,
-        "PD": 469 / 500,
-        "PG": 500 / 500,
-        "PI": 500 / 500,
-        "PM": 60 / 500,
-        "PN": 60 / 500,
-        "PT": 500 / 500,
-        "PU": 500 / 500,
-        "PY": 500 / 500,
-        "RI": 172 / 500,
-        "RP": 498 / 500,
-        "SC": 500 / 500,
-        "SI": 23 / 500,
-        "SN": 500 / 500,
-        "SO": 500 / 500,
-        "SP": 88 / 500,
-        "SU": 2 / 500,
-        "TC": 500 / 500,
-        "TI": 500 / 500,
-        "U1": 500 / 500,
-        "U2": 500 / 500,
-        "UT": 500 / 500,
-        "VL": 495 / 500,
-        "WC": 500 / 500,
-        "Z9": 500 / 500,
-    }
-
-
-def test_collection_citation_pairs(collection_single_document):
-    pairs = [
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Albrecht TR, 2013, IEEE T MAGN, V49, P773, DOI 10.1109/TMAG.2012.2227303",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "BUSCHOW KHJ, 1983, J MAGN MAGN MATER, V38, P1, DOI 10.1016/0304-8853(83)90097-5",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Gapin AI, 2006, J APPL PHYS, V99, DOI 10.1063/1.2163289",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Homma Takayuki, 2015, ECS Transactions, V64, P1, DOI 10.1149/06431.0001ecst",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Kryder MH, 2008, P IEEE, V96, P1810, DOI 10.1109/JPROC.2008.2004315",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Kubo T, 2005, J APPL PHYS, V97, DOI 10.1063/1.1855572",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Lodder JC, 2004, J MAGN MAGN MATER, V272, P1692, DOI 10.1016/j.jmmm.2003.12.259",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Mitsuzuka K, 2007, IEEE T MAGN, V43, P2160, DOI 10.1109/TMAG.2007.893129",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Ouchi T, 2010, ELECTROCHIM ACTA, V55, P8081, DOI 10.1016/j.electacta.2010.02.073",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Pattanaik G, 2006, J APPL PHYS, V99, DOI 10.1063/1.2150805",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Pattanaik G, 2007, ELECTROCHIM ACTA, V52, P2755, DOI 10.1016/j.electacta.2006.07.062",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Piramanayagam SN, 2009, J MAGN MAGN MATER, V321, P485, DOI 10.1016/j.jmmm.2008.05.007",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Ross CA, 2008, MRS BULL, V33, P838, DOI 10.1557/mrs2008.179",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Shiroishi Y, 2009, IEEE T MAGN, V45, P3816, DOI 10.1109/TMAG.2009.2024879",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Sirtori V, 2011, ACS APPL MATER INTER, V3, P1800, DOI 10.1021/am200267u",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Sohn JS, 2009, NANOTECHNOLOGY, V20, DOI 10.1088/0957-4484/20/2/025302",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Sun SH, 2000, SCIENCE, V287, P1989, DOI 10.1126/science.287.5460.1989",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Terris BD, 2007, MICROSYST TECHNOL, V13, P189, DOI 10.1007/s00542-006-0144-9",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Wang JP, 2008, P IEEE, V96, P1847, DOI 10.1109/JPROC.2008.2004318",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Weller D, 1999, IEEE T MAGN, V35, P4423, DOI 10.1109/20.809134",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Weller D, 2000, IEEE T MAGN, V36, P10, DOI 10.1109/20.824418",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Wodarz S, 2016, ELECTROCHIM ACTA, V197, P330, DOI 10.1016/j.electacta.2015.11.136",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Xu X, 2012, J ELECTROCHEM SOC, V159, pD240, DOI 10.1149/2.090204jes",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Yang X, 2007, J VAC SCI TECHNOL B, V25, P2202, DOI 10.1116/1.2798711",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Yang XM, 2009, ACS NANO, V3, P1844, DOI 10.1021/nn900073r",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Yasui N, 2003, APPL PHYS LETT, V83, P3347, DOI 10.1063/1.1622787",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Yua H., 2009, J APPL PHYS, V105",
-        ),
-        (
-            "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061",
-            "Zhu JG, 2008, IEEE T MAGN, V44, P125, DOI 10.1109/TMAG.2007.911031",
-        ),
-    ]
-
-    assert list(collection_single_document.citation_pairs()) == pairs
+    assert next(coauthors) == ("Hasegawa, T", "Homma, T")
+    assert next(coauthors) == ("Hasegawa, T", "Ishio, S")
+    assert next(coauthors) == ("Hasegawa, T", "Wodarz, S")
+    assert next(coauthors) == ("Homma, T", "Ishio, S")
+    assert next(coauthors) == ("Homma, T", "Wodarz, S")
+    assert next(coauthors) == ("Ishio, S", "Wodarz, S")
 
 
 def test_command_line_interface():
diff --git a/wostools/__init__.py b/wostools/__init__.py
index e73dcc4..47315ea 100644
--- a/wostools/__init__.py
+++ b/wostools/__init__.py
@@ -4,6 +4,7 @@
 __email__ = "dev@coreofscience.com"
 __version__ = "1.1.0"
 
-from wostools.wostools import CollectionLazy, WosToolsError, Article
+from wostools.article import Article
+from wostools.lazy import CollectionLazy
 
-__all__ = ["CollectionLazy", "WosToolsError", "Article"]
+__all__ = ["CollectionLazy", "Article"]
diff --git a/wostools/article.py b/wostools/article.py
new file mode 100644
index 0000000..42b2426
--- /dev/null
+++ b/wostools/article.py
@@ -0,0 +1,125 @@
+import logging
+import re
+import collections
+from typing import List, Optional, Mapping, TypeVar, Any
+
+from wostools.fields import parse_all
+
+logger = logging.getLogger(__name__)
+
+# The null part accounts for an ISI wok bug
+ISI_LINE_PATTERN = re.compile(r"^(null)?((?P<field>[A-Z0-9]{2})|  )( (?P<value>.*))?$")
+
+ISI_CITATION_PATTERN = re.compile(
+    r"""^(?P<AU>[^,]+)?,[ ]         # First author
+        (?P<PY>\d{4})?,[ ]          # Publication year
+        (?P<J9>[^,]+)?              # Journal
+        (,[ ]V(?P<VL>[\w\d-]+))?    # Volume
+        (,[ ][Pp](?P<BP>\d+))?      # Start page
+        (,[ ]DOI[ ](?P<DI>.+))?     # The all important DOI
+        """,
+    re.X,
+)
+
+
+class Article(object):  # A bibliographic record built from ISI text or a citation.
+    def __init__(
+        self,
+        title: Optional[str],
+        authors: List[str],
+        year: Optional[int],
+        journal: Optional[str],
+        volume: Optional[str] = None,
+        page: Optional[str] = None,
+        doi: Optional[str] = None,
+        references: Optional[List[str]] = None,
+        sources: Optional[List[str]] = None,
+        extra: Optional[Mapping] = None,
+    ):
+        self.title: Optional[str] = title
+        self.authors: List[str] = authors
+        self.year: Optional[int] = year
+        self.journal: Optional[str] = journal
+        self.volume: Optional[str] = volume
+        self.page: Optional[str] = page
+        self.doi: Optional[str] = doi
+        self.references: List[str] = references or []  # None becomes a fresh list
+        self.sources: List[str] = sources or []
+        self.extra: Mapping[str, Any] = extra or {}
+
+    @property
+    def label(self):  # Citation-style key; needs authors, year and journal.
+        if not (self.authors and self.year and self.journal):
+            raise ValueError("Missing required fields for label")
+        pieces = {
+            "AU": self.authors[0].replace(",", ""),
+            "PY": str(self.year),
+            "J9": str(self.journal),
+            "VL": f"V{self.volume}" if self.volume else None,
+            "BP": f"P{self.page}" if self.page else None,
+            "DI": f"DOI {self.doi}" if self.doi else None,
+        }
+        return ", ".join(value for value in pieces.values() if value)
+
+    def merge(self, other: "Article") -> "Article":  # self wins on scalar fields
+        if self.label != other.label:
+            logger.warning(
+                "Mixing articles with different labels might result in tragedy"
+            )
+        return Article(
+            title=self.title or other.title,
+            authors=list(dict.fromkeys([*self.authors, *other.authors])),  # ordered
+            year=self.year or other.year,
+            journal=self.journal or other.journal,
+            volume=self.volume or other.volume,
+            page=self.page or other.page,
+            doi=self.doi or other.doi,
+            references=list(dict.fromkeys([*self.references, *other.references])),
+            sources=[*self.sources, *other.sources],
+            extra={**self.extra, **other.extra},
+        )
+
+    @classmethod
+    def from_isi_text(cls, raw: str) -> "Article":  # Parse one ISI record.
+        data = collections.defaultdict(list)
+        field = None
+        for line in raw.split("\n"):
+            match = ISI_LINE_PATTERN.match(line)
+            if not match:
+                raise ValueError(f"'{line}' is not a valid ISI file line")
+            field = match.group("field") or field  # untagged lines continue a field
+            value = match.group("value")
+            if field and value is not None:
+                data[field].append(value)
+        processed = parse_all(dict(data))
+        return cls(
+            title=processed.get("title"),
+            authors=processed.get("authors", []),
+            year=processed.get("year"),
+            journal=processed.get("source_abbreviation"),
+            volume=processed.get("volume"),
+            page=processed.get("beginning_page"),
+            doi=processed.get("DOI"),
+            references=processed.get("references"),
+            extra=processed,
+            sources=[raw],
+        )
+
+    @classmethod
+    def from_isi_citation(cls, citation: str) -> "Article":  # Parse a CR-style entry.
+        match = ISI_CITATION_PATTERN.match(citation)
+        if not match:
+            raise ValueError(f"{citation} does not look like an ISI citation")
+        data = {key: [value] for key, value in match.groupdict().items() if value}
+        processed = parse_all(data)
+        return cls(
+            title=processed.get("title"),
+            authors=processed.get("authors", []),
+            year=processed.get("year"),
+            journal=processed.get("source_abbreviation"),
+            volume=processed.get("volume"),
+            page=processed.get("beginning_page"),
+            doi=processed.get("DOI"),
+            extra=processed,
+            sources=[citation],
+        )
diff --git a/wostools/cli.py b/wostools/cli.py
index e7de372..d54edd3 100644
--- a/wostools/cli.py
+++ b/wostools/cli.py
@@ -3,7 +3,6 @@
 import click
 
 from wostools import CollectionLazy
-from wostools.fields import field_aliases, field_keys
 
 
 @click.group()
@@ -32,52 +31,8 @@ def citation_pairs(sources, output):
         return
 
     collection = CollectionLazy.from_filenames(*[f.name for f in sources])
-    pairs = list(collection.citation_pairs())
+    pairs = [
+        (source.label, target.label) for source, target in collection.citation_pairs()
+    ]
 
     json.dump(pairs, output, indent=2)
-
-
-@main.command("to-json")
-@click.argument("sources", type=click.File("r"), nargs=-1)
-@click.option(
-    "--output",
-    type=click.File("w"),
-    show_default=True,
-    default="-",
-    help="File to save json otuput.",
-)
-@click.option(
-    "--raw",
-    default=False,
-    is_flag=True,
-    show_default=True,
-    help="Flag; If true, the fields are the field tags; If false, the fields are the aliases.",
-)
-def to_json(sources, output, raw):
-    """
-    Build a collection by using the sources and print the entries converted to
-    to json format or dumps them in the `output`.
-    """
-    if not len(sources) > 0:
-        click.secho("You should give at least a file with documents.", fg="red")
-        return
-
-    collection = CollectionLazy.from_filenames(*[f.name for f in sources])
-    length = len(collection)
-    output.write("[\n")
-    for i, article in enumerate(collection.articles):
-        fields = field_keys() if raw else field_aliases()
-
-        text = json.dumps(
-            {field: article.data[field] for field in fields if field in article},
-            indent=2,
-        )
-        text = "  " + "\n  ".join(text.split("\n"))
-
-        output.write(text)
-
-        if i + 1 < length:
-            output.write(",\n")
-        else:
-            output.write("\n")
-    output.write("]")
diff --git a/wostools/fields.py b/wostools/fields.py
index 0e7c21c..e8b83d9 100644
--- a/wostools/fields.py
+++ b/wostools/fields.py
@@ -4,7 +4,7 @@
 
 import collections
 import functools
-
+from typing import Any, Dict, List, Mapping
 
 IsiField = collections.namedtuple(
     "IsiField", ["key", "description", "parse", "aliases"]
@@ -16,7 +16,7 @@ def joined(seq, sep=" "):
 
 
 def ident(seq):
-    return list(s.strip() for s in seq)
+    return [s.strip() for s in seq]
 
 
 def delimited(seq, delimiter="; "):
@@ -24,7 +24,14 @@ def delimited(seq, delimiter="; "):
 
 
 def integer(seq):
-    return int(joined(seq).strip())
+    if len(seq) > 1:
+        raise ValueError(f"Expected no more than one item and got {seq}")
+    (first,) = seq
+    return int(first.strip())
+
+
+def unknown(key) -> IsiField:
+    return IsiField(key, key, joined, [])
 
 
 FIELDS = {
@@ -58,7 +65,10 @@ def integer(seq):
     "CY": IsiField("CY", "Conference Date", joined, ["conference_date"]),
     "DE": IsiField("DE", "Author Keywords", delimited, ["author_keywords"]),
     "DI": IsiField(
-        "DI", "Digital Object Identifier (DOI)", joined, ["digital_object_identifier"]
+        "DI",
+        "Digital Object Identifier (DOI)",
+        joined,
+        ["digital_object_identifier", "DOI"],
     ),
     "DT": IsiField("DT", "Document Type", joined, ["document_type"]),
     "D2": IsiField(
@@ -126,7 +136,9 @@ def integer(seq):
         ["publication_type"],
     ),
     "PU": IsiField("PU", "Publisher", joined, ["publisher"]),
-    "PY": IsiField("PY", "Year Published", integer, ["year_published"]),
+    "PY": IsiField(
+        "PY", "Year Published", integer, ["year_published", "year", "publication_year"]
+    ),
     "RI": IsiField("RI", "ResearcherID Number", delimited, ["researcherid_number"]),
     "RP": IsiField("RP", "Reprint Address", joined, ["reprint_address"]),
     "SC": IsiField("SC", "Research Areas", delimited, ["research_areas"]),
@@ -168,17 +180,31 @@ def integer(seq):
 }
 
 
-def field_aliases():
-    for fields in FIELDS.values():
-        yield fields.aliases[-1]
+def parse(key: str, value: List) -> Dict:  # One raw field -> {tag and aliases: parsed}.
+    if key in {"FN", "VR"}:
+        # Header tags are disregarded: they contribute nothing to the result.
+        return {}
+    if key not in FIELDS:
+        raise ValueError(f"{key} is not a known ISI field.")
+    field = FIELDS[key]
+    try:  # only the field parser can fail here; keep the original cause chained
+        parsed = field.parse(value)
+    except ValueError as e:
+        raise ValueError(f"Field {key}: {e}") from e
+    except AttributeError as e:
+        raise AttributeError(f"Field {key}: {e}") from e
+    return {k: parsed for k in [key, *field.aliases]}
 
 
-def field_keys():
-    for fields in FIELDS.values():
-        yield fields.key
+def alias(raw: Dict) -> Dict:  # Copy each value of `raw` under its tag and its aliases.
+    output: Dict[str, Any] = {}
+    for key, value in raw.items():  # walk the input mapping, not the empty result
+        field = FIELDS.get(key, unknown(key))  # unknown tags get an alias-free field
+        output.update({k: value for k in [key, *field.aliases]})
+    return output
 
 
-def preprocess(raw_dict):
+def parse_all(raw_dict: Dict[str, List[str]]) -> Mapping[str, Any]:
     """Preprocesses a dictionary, with information about WoS field tags and its
         value according to a article, with some parser functions that depends on
         the field tag. If there is no a CR field, it adds one to the output with
@@ -200,12 +226,5 @@ def preprocess(raw_dict):
     processed_data = {}
     raw_dict.setdefault("CR", [])
     for key, seq in raw_dict.items():
-        if key in FIELDS:
-            field = FIELDS[key]
-            parsed = field.parse(seq)
-            processed_data[key] = parsed
-            for alias in field.aliases:
-                processed_data[alias] = parsed
-        else:
-            processed_data[key] = " ".join(seq)
+        processed_data.update(parse(key, seq))
     return processed_data
diff --git a/wostools/lazy.py b/wostools/lazy.py
new file mode 100644
index 0000000..4604b0e
--- /dev/null
+++ b/wostools/lazy.py
@@ -0,0 +1,145 @@
+"""
+The whole wostools thing.
+"""
+
+import collections
+import glob
+import itertools
+import logging
+import re
+from typing import Callable, Dict, Iterable, Optional, Tuple, TypeVar, Union
+
+from wostools.article import Article
+
+logger = logging.getLogger(__name__)
+
+
+class CollectionLazy(object):
+    """A collection of WOS text files.
+
+    Args:
+        *files: Open file objects containing articles; each must support
+            ``seek`` and ``read``. Use ``from_filenames`` to pass paths.
+    """
+
+    def __init__(self, *files):
+        self._files = files
+        for file in self._files:
+            file.seek(0)  # rewind every handle on construction
+
+    @classmethod
+    def from_glob(cls, pattern):
+        """Creates a new collection from a pattern using glob.
+
+        Args:
+            pattern (str): String with the pattern to be passed to glob.
+
+        Returns:
+            CollectionLazy: Collection with the articles by using the pattern.
+        """
+        return cls.from_filenames(*glob.glob(pattern))
+
+    @classmethod
+    def from_filenames(cls, *filenames):
+        """Creates a new collection from a list of filenames.
+
+        Args:
+            *filenames (str): Paths of the files, opened as "utf-8-sig" text.
+
+        Returns:
+            CollectionLazy: Collection with the articles by reading the
+                filenames.
+        """
+        files = [open(filename, encoding="utf-8-sig") for filename in filenames]
+        return cls(*files)
+
+    @property
+    def files(self):
+        """Iterates over all files in the collection
+
+        Returns:
+            generator: A generator of stream files.
+        """
+        yield from self._files
+
+    @property
+    def _article_texts(self):
+        """Iterates over all the single article texts in the collection.
+
+        Returns:
+            generator: A generator of strings with the text articles.
+        """
+        for filehandle in self.files:
+            filehandle.seek(0)
+            data = filehandle.read()
+            filehandle.seek(0)  # leave the handle rewound for the next pass
+            for article_text in data.split("\n\n"):  # blank line separates chunks
+                if article_text != "EF":  # presumably the ISI end-of-file tag
+                    yield article_text
+
+    @property
+    def _articles(self) -> Iterable[Article]:  # file records only, no references
+        for article_text in self._article_texts:
+            yield Article.from_isi_text(article_text)
+
+    @property
+    def articles(self) -> Iterable[Article]:
+        """Iterates over all articles and the articles they reference.
+
+        Returns:
+            generator: Each parsed Article followed by its parseable references.
+        """
+        for article in self._articles:
+            yield article
+            for reference in article.references:
+                try:
+                    yield Article.from_isi_citation(reference)
+                except ValueError:  # reference did not match the citation pattern
+                    logger.warning(
+                        f"Ignoring malformed reference {reference} from {article.label}"
+                    )
+
+    def __len__(self):  # counts everything `articles` yields, references included
+        return sum(1 for _ in self.articles)
+
+    @property
+    def authors(self) -> Iterable[str]:
+        """Iterates over all article authors, including duplicates
+
+        Returns:
+            generator: A generator with the authors (one by one) of the
+                articles in the collection.
+        """
+        for article in self.articles:  # includes articles built from references
+            yield from article.authors
+
+    @property
+    def coauthors(self) -> Iterable[Tuple[str, str]]:
+        """Iterates over coauthor pairs.
+
+        Returns:
+            generator: A generator with the pair of coauthors of the articles
+                in the collections.
+        """
+        for article in self._articles:
+            yield from (
+                (source, target)
+                for source, target in itertools.combinations(sorted(article.authors), 2)
+            )
+
+    def citation_pairs(self) -> Iterable[Tuple[Article, Article]]:
+        """Computes the citation pairs for the articles in the collection.
+
+        Returns:
+            generator: A generator with the citation links: pairs of
+            Articles, where the first element is the article which cites the
+            second element.
+        """
+        for article in self._articles:
+            for reference in article.references:
+                try:
+                    yield (article, Article.from_isi_citation(reference))
+                except ValueError:  # reference did not match the citation pattern
+                    logger.warning(
+                        f"Found a malformed reference from {article.label}: {reference}"
+                    )
diff --git a/wostools/wostools.py b/wostools/wostools.py
deleted file mode 100644
index 537e924..0000000
--- a/wostools/wostools.py
+++ /dev/null
@@ -1,330 +0,0 @@
-"""
-The whole wostools thing.
-"""
-
-import collections
-import glob
-import itertools
-import re
-from typing import Dict, Callable, Optional, Tuple, TypeVar, Iterable
-
-from wostools.fields import preprocess
-
-
-LABEL_ATTRIBUTES = {
-    "AU": lambda au: au[0].replace(",", ""),
-    "PY": lambda py: py[0],
-    "J9": lambda j9: j9[0],
-    "VL": lambda vl: f"V{vl[0]}",
-    "BP": lambda bp: f"P{bp[0]}",
-    "DI": lambda di: f"DOI {di[0]}",
-}
-
-
-_T = TypeVar("T")
-_V = TypeVar("V")
-
-
-class WosToolsError(Exception):
-    """
-    All the errors go here.
-    """
-
-    pass
-
-
-def parse_label(label: str) -> Dict:
-    pattern = re.compile(
-        r"""^(?P<AU>[^,]+)?,[ ]         # First author
-            (?P<PY>\d{4})?,[ ]          # Publication year
-            (?P<J9>[^,]+)?              # Journal
-            (,[ ]V(?P<VL>[\w\d-]+))?    # Volume
-            (,[ ][Pp](?P<BP>\d+))?      # Start page
-            (,[ ]DOI[ ](?P<DI>.+))?     # The all important DOI
-            """,
-        re.X,
-    )
-
-    default_value = {attr: 0 if attr == "PY" else None for attr in LABEL_ATTRIBUTES}
-
-    match_result = pattern.match(label)
-    if match_result:
-        match_dict = match_result.groupdict()
-        match_dict["PY"] = int(match_dict["PY"] or 0)
-        return match_dict
-    else:
-        return default_value
-
-
-class Article(object):
-    """
-    Abstract a WoS article. It creates some structures to manage the data
-        related to an article. All the fields could be called as attributes.
-        Finally, it contains a method to return a sanitized (and hope unique)
-        label.
-
-    Args:
-        article_text (str): A string containing the record for a WoS article.
-    """
-
-    def __init__(self, article_text):
-        if article_text.startswith("FN"):
-            article_text = "\n".join(article_text.split("\n")[2:])
-
-        self.__article_text = article_text
-        self.__raw_data = Article.__article_text_to_dict(article_text)
-        self.__processed_data = preprocess(self.__raw_data)
-
-    def __getattr__(self, name):
-        if name not in self.__processed_data:
-            raise AttributeError(
-                f"{self.__class__.__name__} does not have an attribute {name}"
-            )
-        return self.__processed_data[name]
-
-    @property
-    def label_attrs(self):
-        return {attr: self.__processed_data.get(attr) for attr in LABEL_ATTRIBUTES}
-
-    @property
-    def label(self):
-        """Builds a label using the fields ["AU", "PY", "J9", "VL", "PG", "DI"].
-
-        Returns:
-            str: A label with those required fields separated by a comma.
-        """
-
-        normalized_fields = [
-            normalizer(self.__raw_data[field])
-            for field, normalizer in LABEL_ATTRIBUTES.items()
-            if self.__raw_data.get(field)
-        ]
-
-        label = ", ".join(normalized_fields)
-        return label
-
-    def __repr__(self):
-        return self.label
-
-    def keys(self):
-        return self.__raw_data.keys()
-
-    @property
-    def text(self):
-        return self.__article_text
-
-    @property
-    def raw_data(self):
-        return self.__raw_data
-
-    @property
-    def data(self):
-        return self.__processed_data
-
-    @staticmethod
-    def __article_text_to_dict(article_text: str):
-        """Translates an article text into a dict using the WoS field tags:
-                http://wos-resources.roblib.upei.ca/WOK46/help/WOK/hft_wos.html
-
-        Args:
-            article_text (str): String with the text of the record for an article.
-
-        Returns:
-            dict: A dict where the keys are the Web of Science Field Tags and the
-                values are the content of the passed article.
-        """
-
-        if article_text.startswith("FN"):
-            article_text = "\n".join(article_text.split("\n")[2:])
-
-        # Fix little bug with isi files
-        if article_text.startswith("null"):
-            article_text = article_text[4:]
-
-        data = collections.defaultdict(list)
-        field = ""
-        for line in re.split(r"\n+", article_text):
-            name = line[:2]
-            value = line[3:]
-
-            if not name.isspace():
-                field = name
-
-            if field != "ER":
-                data[field].append(value)
-        return dict(data)
-
-    def __contains__(self, value):
-        return value in self.__processed_data
-
-
-class CollectionLazy(object):
-    """A collection of WOS text files.
-
-    Args:
-        *filenames (str): Strings with the names of the files containing
-            articles.
-    """
-
-    def __init__(self, *files):
-        self.__files = files
-        for file in self.__files:
-            file.seek(0)
-
-    @classmethod
-    def from_glob(cls, pattern):
-        """Creates a new collection from a pattern using glob.
-
-        Args:
-            pattern (str): String with the pattern to be passed to glob.
-
-        Returns:
-            CollectionLazy: Collection with the articles by using the pattern.
-        """
-        return cls.from_filenames(*glob.glob(pattern))
-
-    @classmethod
-    def from_filenames(cls, *filenames):
-        """Creates a new collection from a list of filenames.
-
-        Args:
-            *filenames (str): String with the filename.
-
-        Returns:
-            CollectionLazy: Collection with the articles by reading the
-                filenames.
-        """
-        files = []
-        for filename in filenames:
-            try:
-                files.append(open(filename, encoding="utf-8-sig"))
-            except FileNotFoundError:
-                raise WosToolsError(f"The file {filename} was not found")
-        return cls(*files)
-
-    @property
-    def files(self):
-        """Iterates over all files in the collection
-
-        Returns:
-            generator: A generator of stream files.
-        """
-        for filehandle in self.__files:
-            yield filehandle
-
-    @property
-    def __article_texts(self):
-        """Iterates over all the single article texts in the colection.
-
-        Returns:
-            generator: A generator of strings with the text articles.
-        """
-        for filehandle in self.files:
-            filehandle.seek(0)
-            data = filehandle.read()
-            filehandle.seek(0)
-            for article_text in data.split("\n\n"):
-                if article_text != "EF":
-                    yield article_text
-
-    @property
-    def articles(self):
-        """Iterates over all articles.
-
-        Returns:
-            generator: A generator of Articles according to the text articles.
-        """
-        uniques = set()
-        for article_text in self.__article_texts:
-            article = Article(article_text)
-            if article.label not in uniques:
-                uniques.add(article.label)
-                yield article
-            else:
-                continue
-
-    def __len__(self):
-        count = 0
-        for _ in self.articles:
-            count += 1
-        return count
-
-    @property
-    def authors(self):
-        """Iterates over all article authors, including duplicates
-
-        Returns:
-            generator: A generator with the authors (one by one) of the
-                articles in the collection.
-        """
-        authors = (article.AF for article in self.articles if hasattr(article, "AF"))
-        return itertools.chain(*authors)
-
-    @property
-    def coauthors(self):
-        """Iterates over coauthor pairs.
-
-        Returns:
-            generator: A generator with the pair of coauthors of the articles
-                in the collections.
-        """
-        authors_by_article = (
-            article.AF for article in self.articles if hasattr(article, "AF")
-        )
-        return itertools.chain(
-            *(
-                itertools.combinations(sorted(authors), 2)
-                for authors in authors_by_article
-            )
-        )
-
-    def completeness(self):
-        """Computes the completeness of the collection by key.
-
-        Returns:
-            dict: A dictionary where the keys are strings corresponding to the
-                WoS field tags and the values are the ratio between the articles
-                containing that field and the total number of articles. E.g., if
-                all the articles contain the field AF, the completeness for the
-                tag AF is 1. On the other hand, e.g., if the half of the articles
-                contain the tag DI while the other half do not, the completeness
-                for the tag DI is 0.5.
-        """
-        counters = collections.defaultdict(int)
-        total = 0
-        for article in self.articles:
-            total += 1
-            for key in article.keys():
-                counters[key] += 1
-        return {key: val / total for key, val in counters.items()}
-
-    @staticmethod
-    def metadata_pair_parser(
-        article: Article, reference: str
-    ) -> Tuple[Tuple[str, Dict], Tuple[str, Dict]]:
-        """
-        Convenience function to pass to `citation_pairs` so that we get in 
-        each side of a citation the respective labels and attributes.
-        """
-        return (
-            (article.label, article.label_attrs),
-            (reference, parse_label(reference)),
-        )
-
-    def citation_pairs(
-        self, pair_parser: Optional[Callable[[Article, str], Tuple[_T, _V]]] = None
-    ) -> Iterable[Tuple[_T, _V]]:
-        """Computes the citation pairs for the articles in the collection.
-
-        Returns:
-            genertator: A generator with the citation links: pairs of article
-            labesl, where the firts element is the article which cites the
-            second element.
-        """
-        if pair_parser is None:
-            pair_parser = lambda a, r: (a.label, r)
-        yield from (
-            pair_parser(article, reference)
-            for article in self.articles
-            for reference in article.references
-        )

From da9102276ee4045d123fff2c1199e539551438e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Fri, 5 Jun 2020 00:50:58 -0500
Subject: [PATCH 02/35] Implement the cached version of wostools

---
 tests/conftest.py      |  14 ++--
 tests/test_wostools.py |  61 ++++++++++-------
 wostools/__init__.py   |   3 +-
 wostools/cached.py     | 150 +++++++++++++++++++++++++++++++++++++++++
 wostools/cli.py        |   7 +-
 wostools/lazy.py       |  19 ++----
 6 files changed, 206 insertions(+), 48 deletions(-)
 create mode 100644 wostools/cached.py

diff --git a/tests/conftest.py b/tests/conftest.py
index 5003507..fa0db46 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,7 +2,7 @@
 Configuration file for python-wostools tests.
 """
 
-from wostools import Article, CollectionLazy
+from wostools import Article, CollectionLazy, Collection
 
 import pytest
 import io
@@ -104,11 +104,11 @@ def filename_many_documents():
     return "docs/examples/bit-pattern-savedrecs.txt"
 
 
-@pytest.fixture
-def collection_single_document(filename_single_document):
-    return CollectionLazy.from_filenames(filename_single_document)
+@pytest.fixture(params=[Collection, CollectionLazy])
+def collection_single_document(request, filename_single_document):
+    return request.param.from_filenames(filename_single_document)
 
 
-@pytest.fixture
-def collection_many_documents(filename_many_documents):
-    return CollectionLazy.from_filenames(filename_many_documents)
+@pytest.fixture(params=[Collection, CollectionLazy])
+def collection_many_documents(request, filename_many_documents):
+    return request.param.from_filenames(filename_many_documents)
diff --git a/tests/test_wostools.py b/tests/test_wostools.py
index 7236270..fedfbeb 100644
--- a/tests/test_wostools.py
+++ b/tests/test_wostools.py
@@ -2,7 +2,7 @@
 
 from click.testing import CliRunner
 
-from wostools import CollectionLazy
+from wostools import CollectionLazy, Collection
 from wostools import cli
 from wostools import Article
 import pytest
@@ -161,20 +161,21 @@ def test_collection_from_filenames(collection_many_documents):
     for article in collection_many_documents.articles:
         assert isinstance(article, Article)
 
-    for file in collection_many_documents.files:
+    for file in collection_many_documents._files:
         assert hasattr(file, "read")
         assert isinstance(file, (io.StringIO, io.TextIOWrapper))
         assert file.tell() == 0
 
 
-def test_collection_from_glob():
-    collection = CollectionLazy.from_glob("docs/examples/*.txt")
+@pytest.mark.parametrize("cls,count", [(CollectionLazy, 13892), (Collection, 8797),])
+def test_collection_from_glob(cls, count):
+    collection = cls.from_glob("docs/examples/*.txt")
     for article in collection.articles:
         assert isinstance(article, Article)
 
-    assert len(list(collection.articles)) == 13892
+    assert len(list(collection.articles)) == count
 
-    for file in collection.files:
+    for file in collection._files:
         assert hasattr(file, "read")
         assert isinstance(file, (io.StringIO, io.TextIOWrapper))
         assert file.tell() == 0
@@ -188,42 +189,54 @@ def test_collection_from_streams(filename_single_document):
         for article in collection.articles:
             assert isinstance(article, Article)
 
-        for file in collection.files:
+        for file in collection._files:
             assert hasattr(file, "read")
             assert isinstance(file, (io.StringIO, io.TextIOWrapper))
             assert file.tell() == 0
 
 
 def test_collection_with_duplicated(filename_single_document, filename_many_documents):
+    collection = CollectionLazy.from_filenames(filename_single_document)
+    assert len(list(collection._files)) == 1
+    assert len(list(collection.articles)) == 29
+
     collection = CollectionLazy.from_filenames(
         filename_single_document, filename_single_document, filename_single_document
     )
-    assert len(list(collection.files)) == 3
-    assert len(list(collection.articles)) == 87
+    assert len(list(collection._files)) == 3
+    assert len(list(collection.articles)) == 3 * 29
 
-    collection = CollectionLazy.from_filenames(
-        filename_many_documents, filename_many_documents, filename_many_documents
+
+def test_cached_collection_with_duplicated(
+    filename_single_document, filename_many_documents
+):
+    collection = Collection.from_filenames(filename_single_document)
+    assert len(list(collection._files)) == 1
+    assert len(list(collection.articles)) == 29
+
+    collection = Collection.from_filenames(
+        filename_single_document, filename_single_document
     )
-    assert len(list(collection.files)) == 3
-    assert len(list(collection.articles)) == 41589
+    assert len(list(collection._files)) == 2
+    assert len(list(collection.articles)) == 29
 
 
 def test_collection_authors(collection_single_document):
-    authors = collection_single_document.authors
-    assert next(authors) == "Wodarz, S"
-    assert next(authors) == "Hasegawa, T"
-    assert next(authors) == "Ishio, S"
-    assert next(authors) == "Homma, T"
+    assert {"Wodarz, S", "Hasegawa, T", "Ishio, S", "Homma, T"}.issubset(
+        set(collection_single_document.authors)
+    )
 
 
 def test_collection_coauthors(collection_single_document):
     coauthors = collection_single_document.coauthors
-    assert next(coauthors) == ("Hasegawa, T", "Homma, T")
-    assert next(coauthors) == ("Hasegawa, T", "Ishio, S")
-    assert next(coauthors) == ("Hasegawa, T", "Wodarz, S")
-    assert next(coauthors) == ("Homma, T", "Ishio, S")
-    assert next(coauthors) == ("Homma, T", "Wodarz, S")
-    assert next(coauthors) == ("Ishio, S", "Wodarz, S")
+    assert {
+        ("Hasegawa, T", "Homma, T"),
+        ("Hasegawa, T", "Ishio, S"),
+        ("Hasegawa, T", "Wodarz, S"),
+        ("Homma, T", "Ishio, S"),
+        ("Homma, T", "Wodarz, S"),
+        ("Ishio, S", "Wodarz, S"),
+    }.issubset(set(coauthors))
 
 
 def test_command_line_interface():
diff --git a/wostools/__init__.py b/wostools/__init__.py
index 47315ea..75ebed7 100644
--- a/wostools/__init__.py
+++ b/wostools/__init__.py
@@ -6,5 +6,6 @@
 
 from wostools.article import Article
 from wostools.lazy import CollectionLazy
+from wostools.cached import CollectionCached as Collection
 
-__all__ = ["CollectionLazy", "Article"]
+__all__ = ["Collection", "CollectionLazy", "Article"]
diff --git a/wostools/cached.py b/wostools/cached.py
new file mode 100644
index 0000000..ba2fb82
--- /dev/null
+++ b/wostools/cached.py
@@ -0,0 +1,150 @@
+"""
+Collection with a nice cache.
+"""
+
+import collections
+import glob
+import itertools
+import logging
+import re
+from typing import Callable, Dict, Iterable, Optional, Tuple, TypeVar, Union
+
+from wostools.article import Article
+
+logger = logging.getLogger(__name__)
+
+
+class CollectionCached(object):
+    """A collection of WOS text files.
+
+    Args:
+        *filenames (str): Strings with the names of the files containing
+            articles.
+    """
+
+    def __init__(self, *files):
+        self._files = files
+        for file in self._files:
+            file.seek(0)
+        self._cache_key = None
+        self._cache: Dict[str, Article] = {}
+        self._preheat()
+
+    def _add_article(self, article):
+        label = article.label
+        if label in self._cache:
+            article = article.merge(self._cache[label])
+        self._cache[label] = article
+
+    def _preheat(self):
+        """Parse every file into the cache, unless already done for these files."""
+        key = ":".join(str(id(file)) for file in self._files)
+        if key == self._cache_key:
+            return
+        for article in self._articles:
+            self._add_article(article)
+            for reference in article.references:
+                try:
+                    self._add_article(Article.from_isi_citation(reference))
+                except ValueError:
+                    logger.info(
+                        f"Ignoring malformed reference '{reference}' from '{article.label}'"
+                    )
+        self._cache_key = key
+
+    @classmethod
+    def from_glob(cls, pattern):
+        """Creates a new collection from a pattern using glob.
+
+        Args:
+            pattern (str): String with the pattern to be passed to glob.
+
+        Returns:
+            CollectionLazy: Collection with the articles by using the pattern.
+        """
+        return cls.from_filenames(*glob.glob(pattern))
+
+    @classmethod
+    def from_filenames(cls, *filenames):
+        """Creates a new collection from a list of filenames.
+
+        Args:
+            *filenames (str): String with the filename.
+
+        Returns:
+            CollectionLazy: Collection with the articles by reading the
+                filenames.
+        """
+        files = [open(filename, encoding="utf-8-sig") for filename in filenames]
+        return cls(*files)
+
+    @property
+    def _article_texts(self) -> Iterable[str]:
+        """Iterates over all the single article texts in the colection.
+
+        Returns:
+            generator: A generator of strings with the text articles.
+        """
+        for filehandle in self._files:
+            filehandle.seek(0)
+            data = filehandle.read()
+            filehandle.seek(0)
+            for article_text in data.split("\n\n"):
+                if article_text != "EF":
+                    yield article_text
+
+    @property
+    def _articles(self) -> Iterable[Article]:
+        for article_text in self._article_texts:
+            yield Article.from_isi_text(article_text)
+
+    @property
+    def articles(self) -> Iterable[Article]:
+        """Iterates over all articles.
+
+        Returns:
+            generator: A generator of Articles according to the text articles.
+        """
+        self._preheat()
+        yield from self._cache.values()
+
+    def __len__(self):
+        return sum(1 for _ in self.articles)
+
+    @property
+    def authors(self) -> Iterable[str]:
+        """Iterates over all article authors, including duplicates
+
+        Returns:
+            generator: A generator with the authors (one by one) of the
+                articles in the collection.
+        """
+        for article in self.articles:
+            yield from article.authors
+
+    @property
+    def coauthors(self) -> Iterable[Tuple[str, str]]:
+        """Iterates over coauthor pairs.
+
+        Returns:
+            generator: A generator with the pair of coauthors of the articles
+                in the collections.
+        """
+        for article in self._articles:
+            yield from (
+                (source, target)
+                for source, target in itertools.combinations(sorted(article.authors), 2)
+            )
+
+    def citation_pairs(self) -> Iterable[Tuple[Article, Article]]:
+        """Computes the citation pairs for the articles in the collection.
+
+        Returns:
+            generator: A generator with the citation links: pairs of cached
+            articles, where the first element is the article which cites the
+            second element. References missing from the cache are skipped.
+        """
+        for article in self._cache.values():
+            for reference in article.references:
+                if reference in self._cache:
+                    yield (article, self._cache[reference])
diff --git a/wostools/cli.py b/wostools/cli.py
index d54edd3..74b8dfb 100644
--- a/wostools/cli.py
+++ b/wostools/cli.py
@@ -1,8 +1,9 @@
 import json
+import logging
 
 import click
 
-from wostools import CollectionLazy
+from wostools import Collection
 
 
 @click.group()
@@ -10,6 +11,8 @@ def main():
     """
     A little cli for wos tools.
     """
+    logger = logging.getLogger("wostools")
+    logger.setLevel(logging.ERROR)
 
 
 @main.command("citation-pairs")
@@ -30,7 +33,7 @@ def citation_pairs(sources, output):
         click.secho("You should give at least a file with documents.", fg="red")
         return
 
-    collection = CollectionLazy.from_filenames(*[f.name for f in sources])
+    collection = Collection.from_filenames(*[f.name for f in sources])
     pairs = [
         (source.label, target.label) for source, target in collection.citation_pairs()
     ]
diff --git a/wostools/lazy.py b/wostools/lazy.py
index 4604b0e..78236e2 100644
--- a/wostools/lazy.py
+++ b/wostools/lazy.py
@@ -53,15 +53,6 @@ def from_filenames(cls, *filenames):
         files = [open(filename, encoding="utf-8-sig") for filename in filenames]
         return cls(*files)
 
-    @property
-    def files(self):
-        """Iterates over all files in the collection
-
-        Returns:
-            generator: A generator of stream files.
-        """
-        yield from self._files
-
     @property
     def _article_texts(self):
         """Iterates over all the single article texts in the colection.
@@ -69,7 +60,7 @@ def _article_texts(self):
         Returns:
             generator: A generator of strings with the text articles.
         """
-        for filehandle in self.files:
+        for filehandle in self._files:
             filehandle.seek(0)
             data = filehandle.read()
             filehandle.seek(0)
@@ -95,8 +86,8 @@ def articles(self) -> Iterable[Article]:
                 try:
                     yield Article.from_isi_citation(reference)
                 except ValueError:
-                    logger.warning(
-                        f"Ignoring malformed reference {reference} from {article.label}"
+                    logger.info(
+                        f"Ignoring malformed reference '{reference}' from '{article.label}'"
                     )
 
     def __len__(self):
@@ -140,6 +131,6 @@ def citation_pairs(self) -> Iterable[Tuple[Article, Article]]:
                 try:
                     yield (article, Article.from_isi_citation(reference))
                 except ValueError:
-                    logger.warning(
-                        f"Found a malformed reference from {article.label}: {reference}"
+                    logger.info(
+                        f"Ignoring malformed reference '{reference}' from '{article.label}'"
                     )

From 241245f83153e39ced19b30e195d1bb7db468a68 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Fri, 5 Jun 2020 01:07:34 -0500
Subject: [PATCH 03/35] Add the all mighty dict representation function

---
 wostools/article.py | 39 ++++++++++++++++++++++++++++++++-------
 wostools/cli.py     | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 7 deletions(-)

diff --git a/wostools/article.py b/wostools/article.py
index 42b2426..8ad96e3 100644
--- a/wostools/article.py
+++ b/wostools/article.py
@@ -1,7 +1,7 @@
+import collections
 import logging
 import re
-import collections
-from typing import List, Optional, Mapping, TypeVar, Any
+from typing import Any, List, Mapping, Optional, Set, TypeVar
 
 from wostools.fields import parse_all
 
@@ -33,7 +33,7 @@ def __init__(
         page: Optional[str] = None,
         doi: Optional[str] = None,
         references: Optional[List[str]] = None,
-        sources: Optional[List[str]] = None,
+        sources: Optional[Set[str]] = None,
         extra: Optional[Mapping] = None,
     ):
         self.title: Optional[str] = title
@@ -44,7 +44,7 @@ def __init__(
         self.page: Optional[str] = page
         self.doi: Optional[str] = doi
         self.references: List[str] = references or []
-        self.sources: List[str] = sources or []
+        self.sources: Set[str] = sources or set()
         self.extra: Mapping[str, Any] = extra or {}
 
     @property
@@ -61,6 +61,31 @@ def label(self):
         }
         return ", ".join(value for value in pieces.values() if value)
 
+    def to_dict(self, simplified=True):
+        """
+        Transform the article into some key value pairs for easy transportation.
+        """
+        extra = (
+            {
+                "references": self.references,
+                "extra": self.extra,
+                "sources": list(self.sources),
+            }
+            if not simplified
+            else {}
+        )
+        return {
+            "title": self.title,
+            "authors": self.authors,
+            "keywords": self.extra.get("keywords", []),
+            "year": self.year,
+            "journal": self.journal,
+            "volume": self.volume,
+            "page": self.page,
+            "doi": self.doi,
+            **extra,
+        }
+
     def merge(self, other: "Article") -> "Article":
         if self.label != other.label:
             logger.warning(
@@ -74,7 +99,7 @@ def merge(self, other: "Article") -> "Article":
             volume=self.volume or other.volume,
             page=self.page or other.page,
             doi=self.doi or other.doi,
-            sources=[*self.sources, *other.sources],
+            sources={*self.sources, *other.sources},
             extra={**self.extra, **other.extra},
         )
 
@@ -102,7 +127,7 @@ def from_isi_text(cls, raw: str) -> "Article":
             doi=processed.get("DOI"),
             references=processed.get("references"),
             extra=processed,
-            sources=[raw],
+            sources={raw},
         )
 
     @classmethod
@@ -121,5 +146,5 @@ def from_isi_citation(cls, citation: str) -> "Article":
             page=processed.get("beginning_page"),
             doi=processed.get("DOI"),
             extra=processed,
-            sources=[citation],
+            sources={citation},
         )
diff --git a/wostools/cli.py b/wostools/cli.py
index 74b8dfb..fa4f6dc 100644
--- a/wostools/cli.py
+++ b/wostools/cli.py
@@ -39,3 +39,37 @@ def citation_pairs(sources, output):
     ]
 
     json.dump(pairs, output, indent=2)
+
+
+@main.command("to-dict")
+@click.argument("sources", type=click.File("r"), nargs=-1)
+@click.option(
+    "--output",
+    type=click.File("w"),
+    show_default=True,
+    default="-",
+    help="File to save json otuput.",
+)
+@click.option(
+    "-m",
+    "--more",
+    is_flag=True,
+    show_default=True,
+    default=False,
+    help="Add extra info to the output",
+)
+def to_dict(sources, output, more):
+    """
+    Build a collection by using the sources and print its articles as a json
+    list of dictionaries or dump them in the `output`.
+    """
+    if not sources:
+        click.secho("You should give at least a file with documents.", fg="red")
+        return
+
+    collection = Collection.from_filenames(*[f.name for f in sources])
+    json.dump(
+        [article.to_dict(simplified=not more) for article in collection.articles],
+        output,
+        indent=2,
+    )

From 6901e05313699d820b959739825e16eebb6c1c95 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Fri, 5 Jun 2020 10:56:36 -0500
Subject: [PATCH 04/35] Get rid of some travis config

---
 .flake8                             |  7 +++++
 .github/workflows/pythonpackage.yml |  4 +--
 .travis.yml                         | 27 -----------------
 Makefile                            |  3 --
 docs/wostools.rst                   | 46 +++++++++++++++++++----------
 setup.cfg                           |  1 -
 tests/test_wostools.py              |  2 +-
 tox.ini                             | 24 ---------------
 wostools/article.py                 |  2 +-
 wostools/cached.py                  | 13 +++-----
 wostools/lazy.py                    |  8 ++---
 11 files changed, 49 insertions(+), 88 deletions(-)
 create mode 100644 .flake8
 delete mode 100644 .travis.yml
 delete mode 100644 tox.ini

diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..600f0cb
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,7 @@
+[flake8]
+ignore = E203, E266, E501, W503
+# line length is intentionally set to 80 here because black uses Bugbear
+# See https://github.com/psf/black/blob/master/README.md#line-length for more details
+max-line-length = 89
+max-complexity = 18
+select = B,C,E,F,W,T4,B9
\ No newline at end of file
diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
index cf2712e..7dd2afa 100644
--- a/.github/workflows/pythonpackage.yml
+++ b/.github/workflows/pythonpackage.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       max-parallel: 4
       matrix:
-        python-version: [3.6, 3.7]
+        python-version: [3.6, 3.7, 3.8]
 
     steps:
     - uses: actions/checkout@v1
@@ -27,7 +27,7 @@ jobs:
         # stop the build if there are Python syntax errors or undefined names
         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+        flake8 . --count --exit-zero --statistics
     - name: Test with pytest
       run: |
         pip install pytest
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 3659aa5..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,27 +0,0 @@
-# Config file for automatic testing at travis-ci.org
-
-language: python
-python:
-    - 3.6
-    - 3.7
-
-# Command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
-install: pip install -U tox-travis
-
-# Command to run tests, e.g. python setup.py test
-script: tox
-
-# Assuming you have installed the travis-ci CLI tool, after you
-# create the Github repo and add it to Travis, run the
-# following command to finish PyPI deployment setup:
-# $ travis encrypt --add deploy.password
-deploy:
-    provider: pypi
-    distributions: sdist bdist_wheel
-    user: odarbelaeze
-    password:
-        secure: w9KxhyHxZq2HbWsjA/TutcgCsBKhEJQaTLOmFS82e0zZOS5vTuLAQ5dS89CeSwH8vS/whJkgdQcSwCoHQw/g72o+G7ndw6moxQz4wZKUnhh9Ls4EjCDiHM/jDl3rnZzduNzZbMl0TFTG8nw62OvwJIzmjelkvsN+DXy0Oa4hob+I4pEEvhvubvLcemxz+BCIKB+v24mD+t1OFUfkB1bxygA9ekzfRFxOHYR/ZJbwZM7J1+hEhyhsLiVBYDYU5Abx46R34x3OxS98suoW1wGeZMPi4tqBy2hYVZjhJGMAP+aIle3RgS6ld69w/R76RWggW1TDj1UYhMo6LUD77+6A5nHmSab8PEAGiP1pfrU8LDesZyv11Xrfd62Pf+jgbV/1BXnHamOG2YV07dTRVClo9KEvpvhseQnpL0KL1tQDfFW9Wbk2zFFNuhNlLBd6ER0EIu0wpuEvBOCweNnFYyQKYellMPon3P6ljPgNy2Qn1YCRlWVzCoumBpy49ej69DhT0Kt0Bi4VBf7dKAp6ETQFHSiSsJiPm3qY+DQg9UZ5KKSy1wwWd8mo5DvbVjC67uSmF5N4ap+OFhjUQBrcaqsF0/wpO87bse8hScU8e8LAKDzl9UKcyN9USZl0BY2TTTlHqeYiP7FwfJPU421kcd/lNo/Hu6tJBYboQi5MnmkbFug=
-    on:
-        tags: true
-        repo: coreofscience/python-wostools
-        python: 3.6
diff --git a/Makefile b/Makefile
index 741b84b..c89df53 100644
--- a/Makefile
+++ b/Makefile
@@ -56,9 +56,6 @@ lint: ## check style with flake8
 test: ## run tests quickly with the default Python
 	py.test
 
-test-all: ## run tests on every Python version with tox
-	tox
-
 coverage: ## check code coverage quickly with the default Python
 	coverage run --source wostools -m pytest
 	coverage report -m
diff --git a/docs/wostools.rst b/docs/wostools.rst
index 29a30be..b45ba29 100644
--- a/docs/wostools.rst
+++ b/docs/wostools.rst
@@ -4,35 +4,51 @@ wostools package
 Submodules
 ----------
 
+wostools.article module
+-----------------------
+
+.. automodule:: wostools.article
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+wostools.cached module
+----------------------
+
+.. automodule:: wostools.cached
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 wostools.cli module
 -------------------
 
 .. automodule:: wostools.cli
-    :members:
-    :undoc-members:
-    :show-inheritance:
+   :members:
+   :undoc-members:
+   :show-inheritance:
 
 wostools.fields module
 ----------------------
 
 .. automodule:: wostools.fields
-    :members:
-    :undoc-members:
-    :show-inheritance:
+   :members:
+   :undoc-members:
+   :show-inheritance:
 
-wostools.wostools module
-------------------------
+wostools.lazy module
+--------------------
 
-.. automodule:: wostools.wostools
-    :members:
-    :undoc-members:
-    :show-inheritance:
+.. automodule:: wostools.lazy
+   :members:
+   :undoc-members:
+   :show-inheritance:
 
 
 Module contents
 ---------------
 
 .. automodule:: wostools
-    :members:
-    :undoc-members:
-    :show-inheritance:
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/setup.cfg b/setup.cfg
index ae3380f..a5d6c66 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -26,4 +26,3 @@ test = pytest
 
 [tool:pytest]
 collect_ignore = ['setup.py']
-
diff --git a/tests/test_wostools.py b/tests/test_wostools.py
index fedfbeb..565af9d 100644
--- a/tests/test_wostools.py
+++ b/tests/test_wostools.py
@@ -167,7 +167,7 @@ def test_collection_from_filenames(collection_many_documents):
         assert file.tell() == 0
 
 
-@pytest.mark.parametrize("cls,count", [(CollectionLazy, 13892), (Collection, 8797),])
+@pytest.mark.parametrize("cls,count", [(CollectionLazy, 13892), (Collection, 8797)])
 def test_collection_from_glob(cls, count):
     collection = cls.from_glob("docs/examples/*.txt")
     for article in collection.articles:
diff --git a/tox.ini b/tox.ini
deleted file mode 100644
index bc28a7f..0000000
--- a/tox.ini
+++ /dev/null
@@ -1,24 +0,0 @@
-[tox]
-envlist = py36, py37, flake8
-
-[travis]
-python =
-    3.6: py36
-    3.7: py37
-
-[testenv:flake8]
-basepython = python
-deps = flake8
-commands = flake8 --ignore=E501 wostools
-
-[testenv]
-setenv =
-    PYTHONPATH = {toxinidir}
-deps =
-    -r{toxinidir}/requirements_dev.txt
-; If you want to make tox run the tests with the same versions, create a
-; requirements.txt with the pinned versions and uncomment the following line:
-;     -r{toxinidir}/requirements.txt
-commands =
-    pip install -U pip
-    py.test --basetemp={envtmpdir}
diff --git a/wostools/article.py b/wostools/article.py
index 8ad96e3..8726119 100644
--- a/wostools/article.py
+++ b/wostools/article.py
@@ -1,7 +1,7 @@
 import collections
 import logging
 import re
-from typing import Any, List, Mapping, Optional, Set, TypeVar
+from typing import Any, List, Mapping, Optional, Set
 
 from wostools.fields import parse_all
 
diff --git a/wostools/cached.py b/wostools/cached.py
index ba2fb82..a025823 100644
--- a/wostools/cached.py
+++ b/wostools/cached.py
@@ -2,12 +2,10 @@
 Collection with a nice cache.
 """
 
-import collections
 import glob
 import itertools
 import logging
-import re
-from typing import Callable, Dict, Iterable, Optional, Tuple, TypeVar, Union
+from typing import Dict, Iterable, Tuple
 
 from wostools.article import Article
 
@@ -15,11 +13,8 @@
 
 
 class CollectionCached(object):
-    """A collection of WOS text files.
-
-    Args:
-        *filenames (str): Strings with the names of the files containing
-            articles.
+    """
+    A collection of WOS text files.
     """
 
     def __init__(self, *files):
@@ -69,7 +64,7 @@ def from_filenames(cls, *filenames):
         """Creates a new collection from a list of filenames.
 
         Args:
-            *filenames (str): String with the filename.
+            filenames (str): String with the filename.
 
         Returns:
             CollectionLazy: Collection with the articles by reading the
diff --git a/wostools/lazy.py b/wostools/lazy.py
index 78236e2..8b1deff 100644
--- a/wostools/lazy.py
+++ b/wostools/lazy.py
@@ -2,12 +2,10 @@
 The whole wostools thing.
 """
 
-import collections
 import glob
 import itertools
 import logging
-import re
-from typing import Callable, Dict, Iterable, Optional, Tuple, TypeVar, Union
+from typing import Iterable, Tuple
 
 from wostools.article import Article
 
@@ -18,7 +16,7 @@ class CollectionLazy(object):
     """A collection of WOS text files.
 
     Args:
-        *filenames (str): Strings with the names of the files containing
+        filenames (str): Strings with the names of the files containing
             articles.
     """
 
@@ -44,7 +42,7 @@ def from_filenames(cls, *filenames):
         """Creates a new collection from a list of filenames.
 
         Args:
-            *filenames (str): String with the filename.
+            filenames (str): String with the filename.
 
         Returns:
             CollectionLazy: Collection with the articles by reading the

From f4fdd6b35291b1caab9eb0a9273490ded7ddf58d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Fri, 5 Jun 2020 12:03:41 -0500
Subject: [PATCH 05/35] Get tocs to a maintainable level

---
 .flake8               |   2 -
 .flake8.ini           |   5 ++
 AUTHORS.md            |   9 +++
 AUTHORS.rst           |  13 ----
 CONTRIBUTING.md       | 136 +++++++++++++++++++++++++++++++++++
 CONTRIBUTING.rst      | 128 ---------------------------------
 HISTORY.md            |  11 +++
 HISTORY.rst           |  15 ----
 MANIFEST.in           |  10 +--
 Makefile              |  11 ---
 README.md             |  58 +++++++++++++++
 README.rst            |  80 ---------------------
 docs/Makefile         |  20 ------
 docs/_static/.gitkeep |   0
 docs/authors.rst      |   1 -
 docs/conf.py          | 160 ------------------------------------------
 docs/contributing.rst |   1 -
 docs/history.rst      |   1 -
 docs/index.rst        |  20 ------
 docs/installation.rst |  51 --------------
 docs/make.bat         |  36 ----------
 docs/modules.rst      |   7 --
 docs/readme.rst       |   1 -
 docs/usage.rst        |   7 --
 docs/wostools.rst     |  54 --------------
 requirements_dev.txt  |   1 +
 setup.py              |   2 +-
 27 files changed, 226 insertions(+), 614 deletions(-)
 create mode 100644 .flake8.ini
 create mode 100644 AUTHORS.md
 delete mode 100644 AUTHORS.rst
 create mode 100644 CONTRIBUTING.md
 delete mode 100644 CONTRIBUTING.rst
 create mode 100644 HISTORY.md
 delete mode 100644 HISTORY.rst
 create mode 100644 README.md
 delete mode 100644 README.rst
 delete mode 100644 docs/Makefile
 delete mode 100644 docs/_static/.gitkeep
 delete mode 100644 docs/authors.rst
 delete mode 100644 docs/conf.py
 delete mode 100644 docs/contributing.rst
 delete mode 100644 docs/history.rst
 delete mode 100644 docs/index.rst
 delete mode 100644 docs/installation.rst
 delete mode 100644 docs/make.bat
 delete mode 100644 docs/modules.rst
 delete mode 100644 docs/readme.rst
 delete mode 100644 docs/usage.rst
 delete mode 100644 docs/wostools.rst

diff --git a/.flake8 b/.flake8
index 600f0cb..f8cc7ae 100644
--- a/.flake8
+++ b/.flake8
@@ -1,7 +1,5 @@
 [flake8]
 ignore = E203, E266, E501, W503
-# line length is intentionally set to 80 here because black uses Bugbear
-# See https://github.com/psf/black/blob/master/README.md#line-length for more details
 max-line-length = 89
 max-complexity = 18
 select = B,C,E,F,W,T4,B9
\ No newline at end of file
diff --git a/.flake8.ini b/.flake8.ini
new file mode 100644
index 0000000..f8cc7ae
--- /dev/null
+++ b/.flake8.ini
@@ -0,0 +1,5 @@
+[flake8]
+ignore = E203, E266, E501, W503
+max-line-length = 89
+max-complexity = 18
+select = B,C,E,F,W,T4,B9
\ No newline at end of file
diff --git a/AUTHORS.md b/AUTHORS.md
new file mode 100644
index 0000000..ed8d9a6
--- /dev/null
+++ b/AUTHORS.md
@@ -0,0 +1,9 @@
+# Credits
+
+## Development Lead
+
+-   Core of Science \<<dev@coreofscience.com>\>
+
+## Contributors
+
+None yet. Why not be the first?
diff --git a/AUTHORS.rst b/AUTHORS.rst
deleted file mode 100644
index e84601f..0000000
--- a/AUTHORS.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-=======
-Credits
-=======
-
-Development Lead
-----------------
-
-* Core of Science <dev@coreofscience.com>
-
-Contributors
-------------
-
-None yet. Why not be the first?
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..bbc149a
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,136 @@
+# Contributing
+
+Contributions are welcome, and they are greatly appreciated! Every
+little bit helps, and credit will always be given.
+
+You can contribute in many ways:
+
+## Types of Contributions
+
+### Report Bugs
+
+Report bugs at
+<https://github.com/coreofscience/python-wostools/issues>.
+
+If you are reporting a bug, please include:
+
+-   Your operating system name and version.
+-   Any details about your local setup that might be helpful in
+    troubleshooting.
+-   Detailed steps to reproduce the bug.
+
+### Fix Bugs
+
+Look through the GitHub issues for bugs. Anything tagged with \"bug\"
+and \"help wanted\" is open to whoever wants to implement it.
+
+### Implement Features
+
+Look through the GitHub issues for features. Anything tagged with
+\"enhancement\" and \"help wanted\" is open to whoever wants to
+implement it.
+
+### Write Documentation
+
+Python WoS tools could always use more documentation, whether as part of
+the official Python WoS tools docs, in docstrings, or even on the web in
+blog posts, articles, and such.
+
+### Submit Feedback
+
+The best way to send feedback is to file an issue at
+<https://github.com/coreofscience/python-wostools/issues>.
+
+If you are proposing a feature:
+
+-   Explain in detail how it would work.
+-   Keep the scope as narrow as possible, to make it easier to
+    implement.
+-   Remember that this is a volunteer-driven project, and that
+    contributions are welcome :)
+
+## Get Started!
+
+Ready to contribute? Here's how to set up `wostools` for
+local development.
+
+1.  Fork the `wostools` repo on GitHub.
+
+2.  Clone your fork locally:
+
+    ```bash
+    $ git clone git@github.com:your_name_here/python-wostools.git
+    ```
+
+3.  Install your local copy into a virtualenv. Assuming you have
+    virtualenvwrapper installed, this is how you set up your fork for
+    local development:
+
+    ```bash
+    $ mkvirtualenv wostools
+    $ cd wostools/
+    $ python setup.py develop
+    ```
+
+4.  Create a branch for local development:
+
+    ```bash
+    $ git checkout -b name-of-your-bugfix-or-feature
+    ```
+
+    Now you can make your changes locally.
+
+5.  When you\'re done making changes, check that your changes pass
+    flake8 and the tests, including testing other Python versions with
+    tox:
+
+    ```bash
+    $ flake8 wostools tests
+    $ python setup.py test  # or: py.test
+    $ tox
+    ```
+
+    To get flake8 and tox, just pip install them into your virtualenv.
+
+6.  Commit your changes and push your branch to GitHub:
+
+    ```bash
+    $ git add .
+    $ git commit -m "Your detailed description of your changes."
+    $ git push origin name-of-your-bugfix-or-feature
+    ```
+
+7.  Submit a pull request through the GitHub website.
+
+## Pull Request Guidelines
+
+Before you submit a pull request, check that it meets these guidelines:
+
+1.  The pull request should include tests.
+2.  If the pull request adds functionality, the docs should be updated.
+    Put your new functionality into a function with a docstring, and add
+    the feature to the list in README.md.
+3.  The pull request should work for Python 3.6, and for PyPy. Check
+    <https://travis-ci.org/coreofscience/python-wostools/pull_requests>
+    and make sure that the tests pass for all supported Python versions.
+
+## Tips
+
+To run a subset of tests:
+
+```bash
+$ py.test tests/test_wostools.py
+```
+
+## Deploying
+
+A reminder for the maintainers on how to deploy. Make sure all your
+changes are committed (including an entry in HISTORY.md). Then run:
+
+```bash
+$ bumpversion patch # possible: major / minor / patch
+$ git push
+$ git push --tags
+```
+
+Travis will then deploy to PyPI if tests pass.
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
deleted file mode 100644
index 13572cd..0000000
--- a/CONTRIBUTING.rst
+++ /dev/null
@@ -1,128 +0,0 @@
-.. highlight:: shell
-
-============
-Contributing
-============
-
-Contributions are welcome, and they are greatly appreciated! Every little bit
-helps, and credit will always be given.
-
-You can contribute in many ways:
-
-Types of Contributions
-----------------------
-
-Report Bugs
-~~~~~~~~~~~
-
-Report bugs at https://github.com/coreofscience/python-wostools/issues.
-
-If you are reporting a bug, please include:
-
-* Your operating system name and version.
-* Any details about your local setup that might be helpful in troubleshooting.
-* Detailed steps to reproduce the bug.
-
-Fix Bugs
-~~~~~~~~
-
-Look through the GitHub issues for bugs. Anything tagged with "bug" and "help
-wanted" is open to whoever wants to implement it.
-
-Implement Features
-~~~~~~~~~~~~~~~~~~
-
-Look through the GitHub issues for features. Anything tagged with "enhancement"
-and "help wanted" is open to whoever wants to implement it.
-
-Write Documentation
-~~~~~~~~~~~~~~~~~~~
-
-Python WoS tools could always use more documentation, whether as part of the
-official Python WoS tools docs, in docstrings, or even on the web in blog posts,
-articles, and such.
-
-Submit Feedback
-~~~~~~~~~~~~~~~
-
-The best way to send feedback is to file an issue at https://github.com/coreofscience/python-wostools/issues.
-
-If you are proposing a feature:
-
-* Explain in detail how it would work.
-* Keep the scope as narrow as possible, to make it easier to implement.
-* Remember that this is a volunteer-driven project, and that contributions
-  are welcome :)
-
-Get Started!
-------------
-
-Ready to contribute? Here's how to set up `wostools` for local development.
-
-1. Fork the `wostools` repo on GitHub.
-2. Clone your fork locally::
-
-    $ git clone git@github.com:your_name_here/python-wostools.git
-
-3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::
-
-    $ mkvirtualenv wostools
-    $ cd wostools/
-    $ python setup.py develop
-
-4. Create a branch for local development::
-
-    $ git checkout -b name-of-your-bugfix-or-feature
-
-   Now you can make your changes locally.
-
-5. When you're done making changes, check that your changes pass flake8 and the
-   tests, including testing other Python versions with tox::
-
-    $ flake8 wostools tests
-    $ python setup.py test or py.test
-    $ tox
-
-   To get flake8 and tox, just pip install them into your virtualenv.
-
-6. Commit your changes and push your branch to GitHub::
-
-    $ git add .
-    $ git commit -m "Your detailed description of your changes."
-    $ git push origin name-of-your-bugfix-or-feature
-
-7. Submit a pull request through the GitHub website.
-
-Pull Request Guidelines
------------------------
-
-Before you submit a pull request, check that it meets these guidelines:
-
-1. The pull request should include tests.
-2. If the pull request adds functionality, the docs should be updated. Put
-   your new functionality into a function with a docstring, and add the
-   feature to the list in README.rst.
-3. The pull request should work for Python 3.6, and for PyPy. Check
-   https://travis-ci.org/coreofscience/python-wostools/pull_requests
-   and make sure that the tests pass for all supported Python versions.
-
-Tips
-----
-
-To run a subset of tests::
-
-$ py.test tests.test_wostools
-
-
-Deploying
----------
-
-A reminder for the maintainers on how to deploy.
-Make sure all your changes are committed (including an entry in HISTORY.rst).
-Then run::
-
-$ bumpversion patch # possible: major / minor / patch
-$ git push
-$ git push --tags
-
-Travis will then deploy to PyPI if tests pass.
diff --git a/HISTORY.md b/HISTORY.md
new file mode 100644
index 0000000..0abe58c
--- /dev/null
+++ b/HISTORY.md
@@ -0,0 +1,11 @@
+# History
+
+## 0.2.0 (2018-08-12)
+
+-   Add support for all WOS fields.
+-   Add graph building support.
+-   Add a little cli for common tasks.
+
+## 0.1.1 (2018-05-10)
+
+-   First release on PyPI.
diff --git a/HISTORY.rst b/HISTORY.rst
deleted file mode 100644
index e5b3e00..0000000
--- a/HISTORY.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-=======
-History
-=======
-
-0.2.0 (2018-08-12)
-------------------
-
-* Add support for all WOS fields.
-* Add graph building support.
-* Add a little cli for common tasks.
-
-0.1.1 (2018-05-10)
-------------------
-
-* First release on PyPI.
diff --git a/MANIFEST.in b/MANIFEST.in
index 965b2dd..68e47f0 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,11 +1,11 @@
-include AUTHORS.rst
-include CONTRIBUTING.rst
-include HISTORY.rst
+include AUTHORS.md
+include CONTRIBUTING.md
+include HISTORY.md
 include LICENSE
-include README.rst
+include README.md
 
 recursive-include tests *
 recursive-exclude * __pycache__
 recursive-exclude * *.py[co]
 
-recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
+recursive-include docs *.md conf.py Makefile make.bat *.jpg *.png *.gif
diff --git a/Makefile b/Makefile
index c89df53..1c88bfb 100644
--- a/Makefile
+++ b/Makefile
@@ -62,17 +62,6 @@ coverage: ## check code coverage quickly with the default Python
 	coverage html
 	$(BROWSER) htmlcov/index.html
 
-docs: ## generate Sphinx HTML documentation, including API docs
-	rm -f docs/wostools.rst
-	rm -f docs/modules.rst
-	sphinx-apidoc -o docs/ wostools
-	$(MAKE) -C docs clean
-	$(MAKE) -C docs html
-	$(BROWSER) docs/_build/html/index.html
-
-servedocs: docs ## compile the docs watching for changes
-	watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
-
 release: dist ## package and upload a release
 	twine upload dist/*
 
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c84b228
--- /dev/null
+++ b/README.md
@@ -0,0 +1,58 @@
+# Python WoS tools
+
+[![Updates](https://pyup.io/repos/github/coreofscience/python-wostools/shield.svg)](https://pyup.io/repos/github/coreofscience/python-wostools/)
+
+[![image](https://img.shields.io/pypi/v/wostools.svg)](https://pypi.python.org/pypi/wostools)
+
+[![DOI: 10.5281/zenodo.1344261](https://zenodo.org/badge/94160457.svg)](https://zenodo.org/badge/latestdoi/94160457)
+
+Translates ISI Web of Knowledge files into python objects.
+
+-   Free software: MIT license
+
+## Quickstart
+
+Install the library by:
+
+```bash
+$ pip install wostools
+```
+
+Say you want to grab the title of all the articles in an ISI file, you
+can grab [this example file](docs/examples/bit-pattern-savedrecs.txt).
+
+```python
+>>> from wostools import Collection
+>>> collection = Collection.from_filenames('docs/examples/bit-pattern-savedrecs.txt')
+>>> for article in collection.articles:
+...     print(article.title)
+In situ grazing incidence small-angle X-ray scattering study of solvent vapor annealing in lamellae-forming block copolymer thin films: Trade-off of defects in deswelling
+Structural control of ultra-fine CoPt nanodot arrays via electrodeposition process
+Porphyrin-based Pt/Pd-containing metallopolymers: Synthesis, characterization, optical property and potential application in bioimaging
+Syntheses and Controllable Self-Assembly of Luminescence Platinum(II) Plane-Coil Diblock Copolymers
+# ...
+```
+
+Never fear wostools cli is here. To help you do some common tasks right
+from your terminal.
+
+```bash
+$ wostools --help
+$ # To extract all the properties in a json file
+$ wostools to-json docs/examples/bit-pattern-savedrecs.txt --output=document.json
+```
+
+## Features
+
+-   Just parses an ISI Web of Knowledge file and produces a native
+    python object.
+-   Through the `CollectionLazy` object it can do this using the minimum
+    amount of memory it can possibly do.
+-   It has a cli to extract documents and citation pairs for you :smile:
+
+## Credits
+
+This package was created with
+[Cookiecutter](https://github.com/audreyr/cookiecutter) and the
+[audreyr/cookiecutter-pypackage](https://github.com/audreyr/cookiecutter-pypackage)
+project template.
diff --git a/README.rst b/README.rst
deleted file mode 100644
index b090cab..0000000
--- a/README.rst
+++ /dev/null
@@ -1,80 +0,0 @@
-================
-Python WoS tools
-================
-
-
-.. image:: https://pyup.io/repos/github/coreofscience/python-wostools/shield.svg
-     :target: https://pyup.io/repos/github/coreofscience/python-wostools/
-     :alt: Updates
-
-.. image:: https://img.shields.io/pypi/v/wostools.svg
-    :target: https://pypi.python.org/pypi/wostools
-
-.. image:: https://img.shields.io/travis/coreofscience/python-wostools.svg
-    :target: https://travis-ci.org/coreofscience/python-wostools
-
-.. image:: https://readthedocs.org/projects/python-wostools/badge/?version=latest
-    :target: https://python-wostools.readthedocs.io/en/latest/?badge=latest
-    :alt: Documentation Status
-
-.. image:: https://zenodo.org/badge/94160457.svg
-   :target: https://zenodo.org/badge/latestdoi/94160457
-   :alt: DOI: 10.5281/zenodo.1344261
-
-Translates ISI Web of Knowledge files into python objects.
-
-
-
-* Free software: MIT license
-* Documentation: https://python-wostools.readthedocs.io.
-
-
-Quickstart
-----------
-
-Install the library by:
-
-.. code-block:: bash
-
-   $ pip install wostools
-
-Say you want to grab the title of all the articles in an ISI file, you can grab
-`this example file`_.
-
-.. code-block:: python
-
-   >>> from wostools import CollectionLazy
-   >>> collection = CollectionLazy.from_filenames('docs/examples/bit-pattern-savedrecs.txt')
-   >>> for article in collection.articles:
-   ...     print(article.TI)
-   In situ grazing incidence small-angle X-ray scattering study of solvent vapor annealing in lamellae-forming block copolymer thin films: Trade-off of defects in deswelling
-   Structural control of ultra-fine CoPt nanodot arrays via electrodeposition process
-   Porphyrin-based Pt/Pd-containing metallopolymers: Synthesis, characterization, optical property and potential application in bioimaging
-   Syntheses and Controllable Self-Assembly of Luminescence Platinum(II) Plane-Coil Diblock Copolymers
-   # ...
-
-Never fear wostools cli is here. To help you do some common tasks right from
-your terminal.
-
-.. code-block:: bash
-
-   $ wostools --help
-   $ # To extract all the properties in a json file
-   $ wostools to-json docs/examples/bit-pattern-savedrecs.txt --output=document.json
-
-Features
---------
-
-* Just parses an ISI Web of Knowledge file and produces a native python object.
-* Through the :code:`CollectionLazy` object it can do this using the minimum
-  amount of memory it can possibly do.
-* It has a cli to extract documents and citation pairs for you :smile:
-
-Credits
--------
-
-This package was created with Cookiecutter_ and the `audreyr/cookiecutter-pypackage`_ project template.
-
-.. _Cookiecutter: https://github.com/audreyr/cookiecutter
-.. _`audreyr/cookiecutter-pypackage`: https://github.com/audreyr/cookiecutter-pypackage
-.. _`this example file`: docs/examples/bit-pattern-savedrecs.txt
diff --git a/docs/Makefile b/docs/Makefile
deleted file mode 100644
index ed385fe..0000000
--- a/docs/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS    =
-SPHINXBUILD   = python -msphinx
-SPHINXPROJ    = wostools
-SOURCEDIR     = .
-BUILDDIR      = _build
-
-# Put it first so that "make" without argument is like "make help".
-help:
-	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/_static/.gitkeep b/docs/_static/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/docs/authors.rst b/docs/authors.rst
deleted file mode 100644
index e122f91..0000000
--- a/docs/authors.rst
+++ /dev/null
@@ -1 +0,0 @@
-.. include:: ../AUTHORS.rst
diff --git a/docs/conf.py b/docs/conf.py
deleted file mode 100644
index a5e3a47..0000000
--- a/docs/conf.py
+++ /dev/null
@@ -1,160 +0,0 @@
-#!/usr/bin/env python
-#
-# wostools documentation build configuration file, created by
-# sphinx-quickstart on Fri Jun  9 13:47:02 2017.
-#
-# This file is execfile()d with the current directory set to its
-# containing dir.
-#
-# Note that not all possible configuration values are present in this
-# autogenerated file.
-#
-# All configuration values have a default; values that are commented out
-# serve to show the default.
-
-# If extensions (or modules to document with autodoc) are in another
-# directory, add these directories to sys.path here. If the directory is
-# relative to the documentation root, use os.path.abspath to make it
-# absolute, like shown here.
-#
-import os
-import sys
-
-sys.path.insert(0, os.path.abspath(".."))
-
-import wostools
-
-# -- General configuration ---------------------------------------------
-
-# If your documentation needs a minimal Sphinx version, state it here.
-#
-# needs_sphinx = '1.0'
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = ["sphinx.ext.autodoc", "sphinx.ext.viewcode"]
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ["_templates"]
-
-# The suffix(es) of source filenames.
-# You can specify multiple suffix as a list of string:
-#
-# source_suffix = ['.rst', '.md']
-source_suffix = ".rst"
-
-# The master toctree document.
-master_doc = "index"
-
-# General information about the project.
-project = u"Python WoS tools"
-copyright = u"2018, Core of Science"
-author = u"Core of Science"
-
-# The version info for the project you're documenting, acts as replacement
-# for |version| and |release|, also used in various other places throughout
-# the built documents.
-#
-# The short X.Y version.
-version = wostools.__version__
-# The full version, including alpha/beta/rc tags.
-release = wostools.__version__
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#
-# This is also used if you do content translation via gettext catalogs.
-# Usually you set "language" from the command line for these cases.
-language = None
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This patterns also effect to html_static_path and html_extra_path
-exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = "sphinx"
-
-# If true, `todo` and `todoList` produce output, else they produce nothing.
-todo_include_todos = False
-
-
-# -- Options for HTML output -------------------------------------------
-
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-#
-html_theme = "alabaster"
-
-# Theme options are theme-specific and customize the look and feel of a
-# theme further.  For a list of options available for each theme, see the
-# documentation.
-#
-# html_theme_options = {}
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ["_static"]
-
-
-# -- Options for HTMLHelp output ---------------------------------------
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = "wostoolsdoc"
-
-
-# -- Options for LaTeX output ------------------------------------------
-
-latex_elements = {
-    # The paper size ('letterpaper' or 'a4paper').
-    #
-    # 'papersize': 'letterpaper',
-    # The font size ('10pt', '11pt' or '12pt').
-    #
-    # 'pointsize': '10pt',
-    # Additional stuff for the LaTeX preamble.
-    #
-    # 'preamble': '',
-    # Latex figure (float) alignment
-    #
-    # 'figure_align': 'htbp',
-}
-
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title, author, documentclass
-# [howto, manual, or own class]).
-latex_documents = [
-    (
-        master_doc,
-        "wostools.tex",
-        u"Python WoS tools Documentation",
-        u"Core of Science",
-        "manual",
-    )
-]
-
-
-# -- Options for manual page output ------------------------------------
-
-# One entry per manual page. List of tuples
-# (source start file, name, description, authors, manual section).
-man_pages = [(master_doc, "wostools", u"Python WoS tools Documentation", [author], 1)]
-
-
-# -- Options for Texinfo output ----------------------------------------
-
-# Grouping the document tree into Texinfo files. List of tuples
-# (source start file, target name, title, author,
-#  dir menu entry, description, category)
-texinfo_documents = [
-    (
-        master_doc,
-        "wostools",
-        u"Python WoS tools Documentation",
-        author,
-        "wostools",
-        "One line description of project.",
-        "Miscellaneous",
-    )
-]
diff --git a/docs/contributing.rst b/docs/contributing.rst
deleted file mode 100644
index e582053..0000000
--- a/docs/contributing.rst
+++ /dev/null
@@ -1 +0,0 @@
-.. include:: ../CONTRIBUTING.rst
diff --git a/docs/history.rst b/docs/history.rst
deleted file mode 100644
index 2506499..0000000
--- a/docs/history.rst
+++ /dev/null
@@ -1 +0,0 @@
-.. include:: ../HISTORY.rst
diff --git a/docs/index.rst b/docs/index.rst
deleted file mode 100644
index a157044..0000000
--- a/docs/index.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-Welcome to Python WoS tools's documentation!
-============================================
-
-.. toctree::
-   :maxdepth: 2
-   :caption: Contents:
-
-   readme
-   installation
-   usage
-   modules
-   contributing
-   authors
-   history
-
-Indices and tables
-==================
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
diff --git a/docs/installation.rst b/docs/installation.rst
deleted file mode 100644
index 64f416c..0000000
--- a/docs/installation.rst
+++ /dev/null
@@ -1,51 +0,0 @@
-.. highlight:: shell
-
-============
-Installation
-============
-
-
-Stable release
---------------
-
-To install Python WoS tools, run this command in your terminal:
-
-.. code-block:: console
-
-    $ pip install wostools
-
-This is the preferred method to install Python WoS tools, as it will always install the most recent stable release.
-
-If you don't have `pip`_ installed, this `Python installation guide`_ can guide
-you through the process.
-
-.. _pip: https://pip.pypa.io
-.. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/
-
-
-From sources
-------------
-
-The sources for Python WoS tools can be downloaded from the `Github repo`_.
-
-You can either clone the public repository:
-
-.. code-block:: console
-
-    $ git clone git://github.com/coreofscience/python-wostools
-
-Or download the `tarball`_:
-
-.. code-block:: console
-
-    $ curl  -OL https://github.com/coreofscience/python-wostools/tarball/master
-
-Once you have a copy of the source, you can install it with:
-
-.. code-block:: console
-
-    $ python setup.py install
-
-
-.. _Github repo: https://github.com/coreofscience/python-wostools
-.. _tarball: https://github.com/coreofscience/python-wostools/tarball/master
diff --git a/docs/make.bat b/docs/make.bat
deleted file mode 100644
index d9d9628..0000000
--- a/docs/make.bat
+++ /dev/null
@@ -1,36 +0,0 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
-	set SPHINXBUILD=python -msphinx
-)
-set SOURCEDIR=.
-set BUILDDIR=_build
-set SPHINXPROJ=wostools
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
-	echo.
-	echo.The Sphinx module was not found. Make sure you have Sphinx installed,
-	echo.then set the SPHINXBUILD environment variable to point to the full
-	echo.path of the 'sphinx-build' executable. Alternatively you may add the
-	echo.Sphinx directory to PATH.
-	echo.
-	echo.If you don't have Sphinx installed, grab it from
-	echo.http://sphinx-doc.org/
-	exit /b 1
-)
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
-
-:end
-popd
diff --git a/docs/modules.rst b/docs/modules.rst
deleted file mode 100644
index e69dabc..0000000
--- a/docs/modules.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-wostools
-========
-
-.. toctree::
-   :maxdepth: 4
-
-   wostools
diff --git a/docs/readme.rst b/docs/readme.rst
deleted file mode 100644
index 72a3355..0000000
--- a/docs/readme.rst
+++ /dev/null
@@ -1 +0,0 @@
-.. include:: ../README.rst
diff --git a/docs/usage.rst b/docs/usage.rst
deleted file mode 100644
index 2a60eb4..0000000
--- a/docs/usage.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-=====
-Usage
-=====
-
-To use Python WoS tools in a project::
-
-    import wostools
diff --git a/docs/wostools.rst b/docs/wostools.rst
deleted file mode 100644
index b45ba29..0000000
--- a/docs/wostools.rst
+++ /dev/null
@@ -1,54 +0,0 @@
-wostools package
-================
-
-Submodules
-----------
-
-wostools.article module
------------------------
-
-.. automodule:: wostools.article
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-wostools.cached module
-----------------------
-
-.. automodule:: wostools.cached
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-wostools.cli module
--------------------
-
-.. automodule:: wostools.cli
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-wostools.fields module
-----------------------
-
-.. automodule:: wostools.fields
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-wostools.lazy module
---------------------
-
-.. automodule:: wostools.lazy
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-
-Module contents
----------------
-
-.. automodule:: wostools
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/requirements_dev.txt b/requirements_dev.txt
index e725414..0a1ced1 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -2,6 +2,7 @@ bumpversion==0.5.3
 flake8==3.7.8
 coverage==4.5.4
 Sphinx==2.2.0
+recommonmark==0.6.0
 
 pytest==5.2.1
 pytest-runner==5.1
diff --git a/setup.py b/setup.py
index 9a53733..91047da 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
 
 from setuptools import setup, find_packages
 
-with open("README.rst") as readme_file:
+with open("README.md") as readme_file:
     readme = readme_file.read()
 
 with open("HISTORY.rst") as history_file:

From cd8da7822f1cb43583251beb7281454d2865d076 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Fri, 5 Jun 2020 12:06:47 -0500
Subject: [PATCH 06/35] Some docs do not apply anymore

---
 MANIFEST.in          | 2 +-
 requirements_dev.txt | 2 --
 setup.py             | 3 ++-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index 68e47f0..a93c75c 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -8,4 +8,4 @@ recursive-include tests *
 recursive-exclude * __pycache__
 recursive-exclude * *.py[co]
 
-recursive-include docs *.md conf.py Makefile make.bat *.jpg *.png *.gif
+recursive-include docs *.txt
diff --git a/requirements_dev.txt b/requirements_dev.txt
index 0a1ced1..99a0f30 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -1,8 +1,6 @@
 bumpversion==0.5.3
 flake8==3.7.8
 coverage==4.5.4
-Sphinx==2.2.0
-recommonmark==0.6.0
 
 pytest==5.2.1
 pytest-runner==5.1
diff --git a/setup.py b/setup.py
index 91047da..f5c95fb 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@
 with open("README.md") as readme_file:
     readme = readme_file.read()
 
-with open("HISTORY.rst") as history_file:
+with open("HISTORY.md") as history_file:
     history = history_file.read()
 
 requirements = ["Click>=7.0"]
@@ -43,4 +43,5 @@
     url="https://github.com/coreofscience/python-wostools",
     version="1.1.0",
     zip_safe=False,
+    long_description_content_type="text/markdown",
 )

From 32ec7e4a7edd9ff41e5063bcbe8f1d9ef5db2884 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Fri, 5 Jun 2020 12:07:35 -0500
Subject: [PATCH 07/35] Some config does not apply anymore

---
 .flake8     | 5 -----
 .flake8.ini | 5 -----
 setup.cfg   | 4 ++++
 3 files changed, 4 insertions(+), 10 deletions(-)
 delete mode 100644 .flake8
 delete mode 100644 .flake8.ini

diff --git a/.flake8 b/.flake8
deleted file mode 100644
index f8cc7ae..0000000
--- a/.flake8
+++ /dev/null
@@ -1,5 +0,0 @@
-[flake8]
-ignore = E203, E266, E501, W503
-max-line-length = 89
-max-complexity = 18
-select = B,C,E,F,W,T4,B9
\ No newline at end of file
diff --git a/.flake8.ini b/.flake8.ini
deleted file mode 100644
index f8cc7ae..0000000
--- a/.flake8.ini
+++ /dev/null
@@ -1,5 +0,0 @@
-[flake8]
-ignore = E203, E266, E501, W503
-max-line-length = 89
-max-complexity = 18
-select = B,C,E,F,W,T4,B9
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index a5d6c66..c5e381a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -20,6 +20,10 @@ universal = 1
 
 [flake8]
 exclude = docs
+ignore = E203, E266, E501, W503
+max-line-length = 89
+max-complexity = 18
+select = B,C,E,F,W,T4,B9
 
 [aliases]
 test = pytest

From 9a1341668d55e20670a6a43f55c03889e1cd7b3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Fri, 5 Jun 2020 12:10:51 -0500
Subject: [PATCH 08/35] Update readme a bit

---
 README.md | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index c84b228..0d5580d 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,11 @@
 # Python WoS tools
 
-[![Updates](https://pyup.io/repos/github/coreofscience/python-wostools/shield.svg)](https://pyup.io/repos/github/coreofscience/python-wostools/)
-
+![Python package](https://github.com/coreofscience/python-wostools/workflows/Python%20package/badge.svg)
 [![image](https://img.shields.io/pypi/v/wostools.svg)](https://pypi.python.org/pypi/wostools)
-
 [![DOI: 10.5281/zenodo.1344261](https://zenodo.org/badge/94160457.svg)](https://zenodo.org/badge/latestdoi/94160457)
 
 Translates ISI Web of Knowledge files into python objects.
 
--   Free software: MIT license
-
 ## Quickstart
 
 Install the library by:
@@ -44,6 +40,7 @@ $ wostools to-json docs/examples/bit-pattern-savedrecs.txt --output=document.jso
 
 ## Features
 
+-   Free software: MIT license
 -   Just parses an ISI Web of Knowledge file and produces a native
     python object.
 -   Through the `CollectionLazy` object it can do this using the minimum

From 44e0c969474593c5e9e34f39d5a19c7ea83af19e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sun, 7 Jun 2020 14:47:29 -0500
Subject: [PATCH 09/35] Do a better job of exception handling

---
 wostools/article.py    | 18 +++++++++++-------
 wostools/cached.py     |  3 ++-
 wostools/exceptions.py | 32 ++++++++++++++++++++++++++++++++
 wostools/lazy.py       |  5 +++--
 4 files changed, 48 insertions(+), 10 deletions(-)
 create mode 100644 wostools/exceptions.py

diff --git a/wostools/article.py b/wostools/article.py
index 8726119..4f4ffac 100644
--- a/wostools/article.py
+++ b/wostools/article.py
@@ -4,6 +4,7 @@
 from typing import Any, List, Mapping, Optional, Set
 
 from wostools.fields import parse_all
+from wostools.exceptions import InvalidReference, InvalidIsiLine
 
 logger = logging.getLogger(__name__)
 
@@ -33,11 +34,13 @@ def __init__(
         page: Optional[str] = None,
         doi: Optional[str] = None,
         references: Optional[List[str]] = None,
+        keywords: Optional[List[str]] = None,
         sources: Optional[Set[str]] = None,
         extra: Optional[Mapping] = None,
     ):
         self.title: Optional[str] = title
         self.authors: List[str] = authors
+        self.keywords: List[str] = keywords or []
         self.year: Optional[int] = year
         self.journal: Optional[str] = journal
         self.volume: Optional[str] = volume
@@ -50,7 +53,7 @@ def __init__(
     @property
     def label(self):
         if not (self.authors and self.year and self.journal):
-            raise ValueError("Missing required fields for label")
+            raise ValueError(self)
         pieces = {
             "AU": self.authors[0].replace(",", ""),
             "PY": str(self.year),
@@ -77,7 +80,7 @@ def to_dict(self, simplified=True):
         return {
             "title": self.title,
             "authors": self.authors,
-            "keywords": self.extra.get("keywords", []),
+            "keywords": self.keywords,
             "year": self.year,
             "journal": self.journal,
             "volume": self.volume,
@@ -110,7 +113,7 @@ def from_isi_text(cls, raw: str) -> "Article":
         for line in raw.split("\n"):
             match = ISI_LINE_PATTERN.match(line)
             if not match:
-                raise ValueError(f"'{line}' is not a valid ISI file line")
+                raise InvalidIsiLine(line)
             parsed = match.groupdict()
             field = parsed.get("field") or field
             if not field or "value" not in parsed or parsed["value"] is None:
@@ -126,15 +129,16 @@ def from_isi_text(cls, raw: str) -> "Article":
             page=processed.get("beginning_page"),
             doi=processed.get("DOI"),
             references=processed.get("references"),
+            keywords=processed.get("keywords"),
             extra=processed,
             sources={raw},
         )
 
     @classmethod
-    def from_isi_citation(cls, citation: str) -> "Article":
-        match = ISI_CITATION_PATTERN.match(citation)
+    def from_isi_citation(cls, reference: str) -> "Article":
+        match = ISI_CITATION_PATTERN.match(reference)
         if not match:
-            raise ValueError(f"{citation} does not look like an ISI citation")
+            raise InvalidReference(reference)
         data = {key: [value] for key, value in match.groupdict().items() if value}
         processed = parse_all(data)
         return cls(
@@ -146,5 +150,5 @@ def from_isi_citation(cls, citation: str) -> "Article":
             page=processed.get("beginning_page"),
             doi=processed.get("DOI"),
             extra=processed,
-            sources={citation},
+            sources={reference},
         )
diff --git a/wostools/cached.py b/wostools/cached.py
index a025823..6d3102c 100644
--- a/wostools/cached.py
+++ b/wostools/cached.py
@@ -8,6 +8,7 @@
 from typing import Dict, Iterable, Tuple
 
 from wostools.article import Article
+from wostools.exceptions import InvalidReference
 
 logger = logging.getLogger(__name__)
 
@@ -41,7 +42,7 @@ def _preheat(self):
             for reference in article.references:
                 try:
                     self._add_article(Article.from_isi_citation(reference))
-                except ValueError:
+                except InvalidReference:
                     logger.info(
                         f"Ignoring malformed reference '{reference}' from '{article.label}'"
                     )
diff --git a/wostools/exceptions.py b/wostools/exceptions.py
new file mode 100644
index 0000000..cf059e3
--- /dev/null
+++ b/wostools/exceptions.py
@@ -0,0 +1,32 @@
+class WosToolsError(Exception):
+    """
+    Any exception known by wostools.
+    """
+
+
+class InvalidReference(WosToolsError, ValueError):
+    """
+    Raised when we try to create an article out of an invalid reference.
+    """
+
+    def __init__(self, reference: str):
+        super().__init__(f"{reference} does not look like an ISI citation")
+
+
+class InvalidIsiLine(WosToolsError, ValueError):
+    """
+    Raised when we encounter an invalid line when processing an ISI file.
+    """
+
+    def __init__(self, line: str):
+        super().__init__(f"'{line}' is not a valid ISI file line")
+
+
+class MissingLabelFields(WosToolsError, ValueError):
+    """
+    Raised when we don't have any of the required fields for an ISI reference.
+    """
+
+    def __init__(self, article, message: str = None):
+        self.article = article
+        super().__init__(message or "Missing required fields for label")
diff --git a/wostools/lazy.py b/wostools/lazy.py
index 8b1deff..ed87109 100644
--- a/wostools/lazy.py
+++ b/wostools/lazy.py
@@ -8,6 +8,7 @@
 from typing import Iterable, Tuple
 
 from wostools.article import Article
+from wostools.exceptions import InvalidReference
 
 logger = logging.getLogger(__name__)
 
@@ -83,7 +84,7 @@ def articles(self) -> Iterable[Article]:
             for reference in article.references:
                 try:
                     yield Article.from_isi_citation(reference)
-                except ValueError:
+                except InvalidReference:
                     logger.info(
                         f"Ignoring malformed reference '{reference}' from '{article.label}'"
                     )
@@ -128,7 +129,7 @@ def citation_pairs(self) -> Iterable[Tuple[Article, Article]]:
             for reference in article.references:
                 try:
                     yield (article, Article.from_isi_citation(reference))
-                except ValueError:
+                except InvalidReference:
                     logger.info(
                         f"Ignoring malformed reference '{reference}' from '{article.label}'"
                     )

From 0d91ba4d66add28c4f517a86cf092767287b5c81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sun, 7 Jun 2020 15:29:24 -0500
Subject: [PATCH 10/35] Update the good old names

---
 README.md              |   6 +-
 tests/conftest.py      |   6 +-
 tests/test_wostools.py |  32 ++++++-----
 wostools/__init__.py   |   6 +-
 wostools/base.py       | 125 +++++++++++++++++++++++++++++++++++++++++
 wostools/cached.py     |  72 ++++--------------------
 wostools/cli.py        |   8 +--
 wostools/lazy.py       |  62 ++------------------
 8 files changed, 172 insertions(+), 145 deletions(-)
 create mode 100644 wostools/base.py

diff --git a/README.md b/README.md
index 0d5580d..4223c5b 100644
--- a/README.md
+++ b/README.md
@@ -18,9 +18,9 @@ Say you want to grab the title of all the articles in an ISI file, you
 can grab [this example file](docs/examples/bit-pattern-savedrecs.txt).
 
 ```python
->>> from wostools import Collection
->>> collection = Collection.from_filenames('docs/examples/bit-pattern-savedrecs.txt')
->>> for article in collection.articles:
+>>> from wostools import CachedCollection
+>>> collection = CachedCollection.from_filenames('docs/examples/bit-pattern-savedrecs.txt')
+>>> for article in collection:
 ...     print(article.title)
 In situ grazing incidence small-angle X-ray scattering study of solvent vapor annealing in lamellae-forming block copolymer thin films: Trade-off of defects in deswelling
 Structural control of ultra-fine CoPt nanodot arrays via electrodeposition process
diff --git a/tests/conftest.py b/tests/conftest.py
index fa0db46..64a4386 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,7 +2,7 @@
 Configuration file for python-wostools tests.
 """
 
-from wostools import Article, CollectionLazy, Collection
+from wostools import Article, LazyCollection, CachedCollection
 
 import pytest
 import io
@@ -104,11 +104,11 @@ def filename_many_documents():
     return "docs/examples/bit-pattern-savedrecs.txt"
 
 
-@pytest.fixture(params=[Collection, CollectionLazy])
+@pytest.fixture(params=[CachedCollection, LazyCollection])
 def collection_single_document(request, filename_single_document):
     return request.param.from_filenames(filename_single_document)
 
 
-@pytest.fixture(params=[Collection, CollectionLazy])
+@pytest.fixture(params=[CachedCollection, LazyCollection])
 def collection_many_documents(request, filename_many_documents):
     return request.param.from_filenames(filename_many_documents)
diff --git a/tests/test_wostools.py b/tests/test_wostools.py
index 565af9d..9cd307f 100644
--- a/tests/test_wostools.py
+++ b/tests/test_wostools.py
@@ -2,7 +2,7 @@
 
 from click.testing import CliRunner
 
-from wostools import CollectionLazy, Collection
+from wostools import LazyCollection, CachedCollection
 from wostools import cli
 from wostools import Article
 import pytest
@@ -158,7 +158,7 @@ def test_article_properties(article):
 
 
 def test_collection_from_filenames(collection_many_documents):
-    for article in collection_many_documents.articles:
+    for article in collection_many_documents:
         assert isinstance(article, Article)
 
     for file in collection_many_documents._files:
@@ -167,13 +167,15 @@ def test_collection_from_filenames(collection_many_documents):
         assert file.tell() == 0
 
 
-@pytest.mark.parametrize("cls,count", [(CollectionLazy, 13892), (Collection, 8797)])
+@pytest.mark.parametrize(
+    "cls,count", [(LazyCollection, 13892), (CachedCollection, 8797)]
+)
 def test_collection_from_glob(cls, count):
     collection = cls.from_glob("docs/examples/*.txt")
-    for article in collection.articles:
+    for article in collection:
         assert isinstance(article, Article)
 
-    assert len(list(collection.articles)) == count
+    assert len(list(collection)) == count
 
     for file in collection._files:
         assert hasattr(file, "read")
@@ -185,8 +187,8 @@ def test_collection_from_streams(filename_single_document):
     with open(filename_single_document) as file:
         _ = file.read()
 
-        collection = CollectionLazy(file)
-        for article in collection.articles:
+        collection = LazyCollection(file)
+        for article in collection:
             assert isinstance(article, Article)
 
         for file in collection._files:
@@ -196,29 +198,29 @@ def test_collection_from_streams(filename_single_document):
 
 
 def test_collection_with_duplicated(filename_single_document, filename_many_documents):
-    collection = CollectionLazy.from_filenames(filename_single_document)
+    collection = LazyCollection.from_filenames(filename_single_document)
     assert len(list(collection._files)) == 1
-    assert len(list(collection.articles)) == 29
+    assert len(list(collection)) == 29
 
-    collection = CollectionLazy.from_filenames(
+    collection = LazyCollection.from_filenames(
         filename_single_document, filename_single_document, filename_single_document
     )
     assert len(list(collection._files)) == 3
-    assert len(list(collection.articles)) == 3 * 29
+    assert len(list(collection)) == 3 * 29
 
 
 def test_cached_collection_with_duplicated(
     filename_single_document, filename_many_documents
 ):
-    collection = Collection.from_filenames(filename_single_document)
+    collection = CachedCollection.from_filenames(filename_single_document)
     assert len(list(collection._files)) == 1
-    assert len(list(collection.articles)) == 29
+    assert len(list(collection)) == 29
 
-    collection = Collection.from_filenames(
+    collection = CachedCollection.from_filenames(
         filename_single_document, filename_single_document
     )
     assert len(list(collection._files)) == 2
-    assert len(list(collection.articles)) == 29
+    assert len(list(collection)) == 29
 
 
 def test_collection_authors(collection_single_document):
diff --git a/wostools/__init__.py b/wostools/__init__.py
index 75ebed7..ca3af1c 100644
--- a/wostools/__init__.py
+++ b/wostools/__init__.py
@@ -5,7 +5,7 @@
 __version__ = "1.1.0"
 
 from wostools.article import Article
-from wostools.lazy import CollectionLazy
-from wostools.cached import CollectionCached as Collection
+from wostools.lazy import LazyCollection
+from wostools.cached import CachedCollection
 
-__all__ = ["Collection", "CollectionLazy", "Article"]
+__all__ = ["CachedCollection", "LazyCollection", "Article"]
diff --git a/wostools/base.py b/wostools/base.py
new file mode 100644
index 0000000..ea48f56
--- /dev/null
+++ b/wostools/base.py
@@ -0,0 +1,125 @@
+"""
+Base collection for a shared API.
+"""
+
+import glob
+import logging
+from typing import Iterable, Iterator, Tuple
+
+from wostools.article import Article
+from wostools.exceptions import InvalidReference
+
+logger = logging.getLogger(__name__)
+
+
+class BaseCollection(object):
+    """
+    A collection of WOS text files.
+    """
+
+    def __init__(self, *files):
+        self._files = files
+        for file in self._files:
+            file.seek(0)
+
+    @classmethod
+    def from_glob(cls, pattern):
+        """Creates a new collection from a pattern using glob.
+
+        Args:
+            pattern (str): String with the pattern to be passed to glob.
+
+        Returns:
+            CollectionLazy: Collection with the articles by using the pattern.
+        """
+        return cls.from_filenames(*glob.glob(pattern))
+
+    @classmethod
+    def from_filenames(cls, *filenames):
+        """Creates a new collection from a list of filenames.
+
+        Args:
+            filenames (str): String with the filename.
+
+        Returns:
+            CollectionLazy: Collection with the articles by reading the
+                filenames.
+        """
+        files = [open(filename, encoding="utf-8-sig") for filename in filenames]
+        return cls(*files)
+
+    @property
+    def _article_texts(self) -> Iterable[str]:
+        """Iterates over all the single article texts in the colection.
+
+        Returns:
+            generator: A generator of strings with the text articles.
+        """
+        for filehandle in self._files:
+            filehandle.seek(0)
+            data = filehandle.read()
+            filehandle.seek(0)
+            for article_text in data.split("\n\n"):
+                if article_text != "EF":
+                    yield article_text
+
+    def _articles(self) -> Iterable[Article]:
+        """
+        Should iterate over all the articles in the ISI file, excluding references.
+        """
+        raise NotImplementedError(
+            "Sub classes should know how to iterate over articles"
+        )
+
+    def __iter__(self) -> Iterator[Article]:
+        """
+        Should iterate over all articles known in the collection.
+        """
+        for article in self._articles():
+            yield article
+            for reference in article.references:
+                try:
+                    yield Article.from_isi_citation(reference)
+                except InvalidReference:
+                    logger.info(
+                        f"Ignoring malformed reference '{reference}' from '{article.label}'"
+                    )
+
+    def __len__(self):
+        return sum(1 for _ in self)
+
+    @property
+    def authors(self) -> Iterable[str]:
+        """Iterates over all article authors, including duplicates
+
+        Returns:
+            generator: A generator with the authors (one by one) of the
+                articles in the collection.
+        """
+        raise NotImplementedError("Sub classes should know how to iterate over authors")
+
+    @property
+    def coauthors(self) -> Iterable[Tuple[str, str]]:
+        """Iterates over coauthor pairs.
+
+        Returns:
+            generator: A generator with the pair of coauthors of the articles
+            in the collections.
+        """
+        raise NotImplementedError(
+            "Sub classes should know how to iterate over coauthors"
+        )
+
+    @property
+    def citation_pairs(self) -> Iterable[Tuple[Article, Article]]:
+        """
+        Computes the citation pairs for the articles in the collection.
+
+        Returns:
+            genertator: A generator with the citation links: pairs of article
+            labesl, where the firts element is the article which cites the
+            second element.
+        """
+        raise NotImplementedError(
+            "Sub classes should know how to iterate over citation pairs"
+        )
diff --git a/wostools/cached.py b/wostools/cached.py
index 6d3102c..ac72670 100644
--- a/wostools/cached.py
+++ b/wostools/cached.py
@@ -2,30 +2,32 @@
 Collection with a nice cache.
 """
 
-import glob
 import itertools
 import logging
-from typing import Dict, Iterable, Tuple
+from typing import Dict, Iterable, Iterator, Tuple
 
 from wostools.article import Article
+from wostools.base import BaseCollection
 from wostools.exceptions import InvalidReference
 
 logger = logging.getLogger(__name__)
 
 
-class CollectionCached(object):
+class CachedCollection(BaseCollection):
     """
     A collection of WOS text files.
     """
 
     def __init__(self, *files):
-        self._files = files
-        for file in self._files:
-            file.seek(0)
+        super().__init__(*files)
         self._cache_key = None
         self._cache: Dict[str, Article] = {}
         self._preheat()
 
+    def _articles(self) -> Iterable[Article]:
+        for article_text in self._article_texts:
+            yield Article.from_isi_text(article_text)
+
     def _add_article(self, article):
         label = article.label
         if label in self._cache:
@@ -37,7 +39,7 @@ def _preheat(self):
         key = ":".join(str(id(file) for file in self._files))
         if key == self._cache_key:
             return
-        for article in self._articles:
+        for article in self._articles():
             self._add_article(article)
             for reference in article.references:
                 try:
@@ -48,54 +50,7 @@ def _preheat(self):
                     )
         self._cache_key = key
 
-    @classmethod
-    def from_glob(cls, pattern):
-        """Creates a new collection from a pattern using glob.
-
-        Args:
-            pattern (str): String with the pattern to be passed to glob.
-
-        Returns:
-            CollectionLazy: Collection with the articles by using the pattern.
-        """
-        return cls.from_filenames(*glob.glob(pattern))
-
-    @classmethod
-    def from_filenames(cls, *filenames):
-        """Creates a new collection from a list of filenames.
-
-        Args:
-            filenames (str): String with the filename.
-
-        Returns:
-            CollectionLazy: Collection with the articles by reading the
-                filenames.
-        """
-        files = [open(filename, encoding="utf-8-sig") for filename in filenames]
-        return cls(*files)
-
-    @property
-    def _article_texts(self) -> Iterable[str]:
-        """Iterates over all the single article texts in the colection.
-
-        Returns:
-            generator: A generator of strings with the text articles.
-        """
-        for filehandle in self._files:
-            filehandle.seek(0)
-            data = filehandle.read()
-            filehandle.seek(0)
-            for article_text in data.split("\n\n"):
-                if article_text != "EF":
-                    yield article_text
-
-    @property
-    def _articles(self) -> Iterable[Article]:
-        for article_text in self._article_texts:
-            yield Article.from_isi_text(article_text)
-
-    @property
-    def articles(self) -> Iterable[Article]:
+    def __iter__(self) -> Iterator[Article]:
         """Iterates over all articles.
 
         Returns:
@@ -104,9 +59,6 @@ def articles(self) -> Iterable[Article]:
         self._preheat()
         yield from self._cache.values()
 
-    def __len__(self):
-        return sum(1 for _ in self.articles)
-
     @property
     def authors(self) -> Iterable[str]:
         """Iterates over all article authors, including duplicates
@@ -115,7 +67,7 @@ def authors(self) -> Iterable[str]:
             generator: A generator with the authors (one by one) of the
                 articles in the collection.
         """
-        for article in self.articles:
+        for article in self:
             yield from article.authors
 
     @property
@@ -126,7 +78,7 @@ def coauthors(self) -> Iterable[Tuple[str, str]]:
             generator: A generator with the pair of coauthors of the articles
                 in the collections.
         """
-        for article in self._articles:
+        for article in self._articles():
             yield from (
                 (source, target)
                 for source, target in itertools.combinations(sorted(article.authors), 2)
diff --git a/wostools/cli.py b/wostools/cli.py
index fa4f6dc..2209f21 100644
--- a/wostools/cli.py
+++ b/wostools/cli.py
@@ -3,7 +3,7 @@
 
 import click
 
-from wostools import Collection
+from wostools import CachedCollection
 
 
 @click.group()
@@ -33,7 +33,7 @@ def citation_pairs(sources, output):
         click.secho("You should give at least a file with documents.", fg="red")
         return
 
-    collection = Collection.from_filenames(*[f.name for f in sources])
+    collection = CachedCollection.from_filenames(*[f.name for f in sources])
     pairs = [
         (source.label, target.label) for source, target in collection.citation_pairs()
     ]
@@ -67,9 +67,9 @@ def to_dict(sources, output, more):
         click.secho("You should give at least a file with documents.", fg="red")
         return
 
-    collection = Collection.from_filenames(*[f.name for f in sources])
+    collection = CachedCollection.from_filenames(*[f.name for f in sources])
     json.dump(
-        [article.to_dict(simplified=not more) for article in collection.articles],
+        [article.to_dict(simplified=not more) for article in collection],
         output,
         indent=2,
     )
diff --git a/wostools/lazy.py b/wostools/lazy.py
index ed87109..36f53d1 100644
--- a/wostools/lazy.py
+++ b/wostools/lazy.py
@@ -2,18 +2,18 @@
 The whole wostools thing.
 """
 
-import glob
 import itertools
 import logging
 from typing import Iterable, Tuple
 
 from wostools.article import Article
+from wostools.base import BaseCollection
 from wostools.exceptions import InvalidReference
 
 logger = logging.getLogger(__name__)
 
 
-class CollectionLazy(object):
+class LazyCollection(BaseCollection):
     """A collection of WOS text files.
 
     Args:
@@ -21,37 +21,6 @@ class CollectionLazy(object):
             articles.
     """
 
-    def __init__(self, *files):
-        self._files = files
-        for file in self._files:
-            file.seek(0)
-
-    @classmethod
-    def from_glob(cls, pattern):
-        """Creates a new collection from a pattern using glob.
-
-        Args:
-            pattern (str): String with the pattern to be passed to glob.
-
-        Returns:
-            CollectionLazy: Collection with the articles by using the pattern.
-        """
-        return cls.from_filenames(*glob.glob(pattern))
-
-    @classmethod
-    def from_filenames(cls, *filenames):
-        """Creates a new collection from a list of filenames.
-
-        Args:
-            filenames (str): String with the filename.
-
-        Returns:
-            CollectionLazy: Collection with the articles by reading the
-                filenames.
-        """
-        files = [open(filename, encoding="utf-8-sig") for filename in filenames]
-        return cls(*files)
-
     @property
     def _article_texts(self):
         """Iterates over all the single article texts in the colection.
@@ -67,31 +36,10 @@ def _article_texts(self):
                 if article_text != "EF":
                     yield article_text
 
-    @property
     def _articles(self) -> Iterable[Article]:
         for article_text in self._article_texts:
             yield Article.from_isi_text(article_text)
 
-    @property
-    def articles(self) -> Iterable[Article]:
-        """Iterates over all articles.
-
-        Returns:
-            generator: A generator of Articles according to the text articles.
-        """
-        for article in self._articles:
-            yield article
-            for reference in article.references:
-                try:
-                    yield Article.from_isi_citation(reference)
-                except InvalidReference:
-                    logger.info(
-                        f"Ignoring malformed reference '{reference}' from '{article.label}'"
-                    )
-
-    def __len__(self):
-        return sum(1 for _ in self.articles)
-
     @property
     def authors(self) -> Iterable[str]:
         """Iterates over all article authors, including duplicates
@@ -100,7 +48,7 @@ def authors(self) -> Iterable[str]:
             generator: A generator with the authors (one by one) of the
                 articles in the collection.
         """
-        for article in self.articles:
+        for article in self:
             yield from article.authors
 
     @property
@@ -111,7 +59,7 @@ def coauthors(self) -> Iterable[Tuple[str, str]]:
             generator: A generator with the pair of coauthors of the articles
                 in the collections.
         """
-        for article in self._articles:
+        for article in self._articles():
             yield from (
                 (source, target)
                 for source, target in itertools.combinations(sorted(article.authors), 2)
@@ -125,7 +73,7 @@ def citation_pairs(self) -> Iterable[Tuple[Article, Article]]:
             labesl, where the firts element is the article which cites the
             second element.
         """
-        for article in self._articles:
+        for article in self:
             for reference in article.references:
                 try:
                     yield (article, Article.from_isi_citation(reference))

From 60a9dcd017aac3a0726d3856368a4ef7483228d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sat, 4 Jul 2020 01:43:57 -0500
Subject: [PATCH 11/35] Start testing with pytest bdd

---
 requirements_dev.txt                          |   1 +
 setup.py                                      |   2 +-
 tests/{test_wostools.py => _test_wostools.py} |   0
 tests/features/article.feature                |  30 +++++
 tests/test_article.py                         | 112 ++++++++++++++++++
 5 files changed, 144 insertions(+), 1 deletion(-)
 rename tests/{test_wostools.py => _test_wostools.py} (100%)
 create mode 100644 tests/features/article.feature
 create mode 100644 tests/test_article.py

diff --git a/requirements_dev.txt b/requirements_dev.txt
index 5c0b518..8881d38 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -4,3 +4,4 @@ coverage==5.1
 
 pytest==5.4.3
 pytest-runner==5.2
+pytest-bdd==3.4.0
diff --git a/setup.py b/setup.py
index f5c95fb..c511c40 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@
 
 setup_requirements = ["pytest-runner"]
 
-test_requirements = ["pytest"]
+test_requirements = ["pytest", "pytest-bdd"]
 
 setup(
     author="Core of Science",
diff --git a/tests/test_wostools.py b/tests/_test_wostools.py
similarity index 100%
rename from tests/test_wostools.py
rename to tests/_test_wostools.py
diff --git a/tests/features/article.feature b/tests/features/article.feature
new file mode 100644
index 0000000..1b214ff
--- /dev/null
+++ b/tests/features/article.feature
@@ -0,0 +1,30 @@
+Feature: Article manager class
+
+   Allows the user to parse and sort of dump articles
+
+   Scenario: Computing an article's label
+      Given an article with authors, year and journal
+      When I compute the label for the article
+      Then the label is a proper string
+
+   Scenario Outline: Fail to compute a label
+      Given a complete article missing <field>
+      When I try to compute the label for the article
+      Then There's an error computing the label
+
+      Examples:
+         | field   |
+         | year    |
+         | authors |
+         | journal |
+
+   Scenario: Merge two articles
+      Given a complete article
+      And theres a similar article that includes a doi
+
+      When I merge the two articles
+      And I try to compute the label for the article
+
+      Then the article's doi matches the other
+      And there's no error computing the label
+      And the label contains the doi of the other
diff --git a/tests/test_article.py b/tests/test_article.py
new file mode 100644
index 0000000..c567223
--- /dev/null
+++ b/tests/test_article.py
@@ -0,0 +1,112 @@
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Optional
+
+from pytest import fixture
+from pytest_bdd import scenarios, given, then, when, parsers
+
+from wostools.article import Article
+
+
+@dataclass
+class Context:
+    article: Optional[Article]
+    label: Optional[str] = None
+    expected_label: Optional[str] = None
+    error: Optional[Exception] = None
+
+
+scenarios("features/article.feature")
+
+
+@given("a complete article missing <field>", target_fixture="context")
+@given(parsers.parse("a complete article missing {field:w}"), target_fixture="context")
+def article_missing(field: str):
+    article = Article(
+        title=None, authors=["L, Robertson"], year=1999, journal="Science"
+    )
+    setattr(article, field, None)
+    return Context(article=article)
+
+
+@given("a complete article", target_fixture="context")
+@given("an article with authors, year and journal", target_fixture="context")
+def article_with_authors_year_and_journal():
+    return Context(
+        article=Article(
+            title=None, authors=["L, Robertson"], year=1999, journal="Science"
+        ),
+        expected_label="L Robertson, 1999, Science",
+    )
+
+
+@given("theres a similar article that includes a doi", target_fixture="other")
+def similar_article_with_doi(context: Context):
+    assert context.article, "missing article to copy"
+    article = deepcopy(context.article)
+    article.doi = "somedoi/123"
+    if context.expected_label:
+        return Context(
+            article=article,
+            expected_label=", ".join([context.expected_label, article.doi]),
+        )
+    return Context(article=article)
+
+
+@when("I compute the label for the article")
+def compute_label_for_article(context: Context):
+    assert context.article, "Missing article for this step"
+    context.label = context.article.label
+
+
+@when("I merge the two articles")
+def merge_articles(context: Context, other: Context):
+    assert context.article, "Missing article for this step"
+    assert other.article, "Missing other article for this step"
+    context.article = context.article.merge(other.article)
+    context.expected_label = None
+
+
+@when("I try to compute the label for the article")
+def try_to_compute_label(context: Context):
+    assert context.article, "Missing article for this step"
+    try:
+        context.label = context.article.label
+    except Exception as e:
+        context.error = e
+
+
+@then("the label is a proper string")
+def then_label_is_a_proper_string(context: Context):
+    assert context.expected_label
+    assert context.label
+    assert context.label == context.expected_label
+
+
+@then("the label contains the doi of the other")
+def label_matches_other(context: Context, other: Context):
+    assert context.label, "You didn't get a label in the then block"
+    assert other.article and other.article.doi, "There's no doi in the other article"
+    assert other.article.doi in context.label
+
+
+@then("There's no error computing the label")
+@then("there's no error computing the label")
+def no_error_computing_label(context: Context):
+    assert context.label
+    assert not context.error
+
+
+@then("There's an error computing the label")
+def error_computing_label(context: Context):
+    assert not context.label
+    assert context.error
+    assert isinstance(context.error, ValueError)
+
+
+@then(parsers.parse("the article matches the {field:w} of the other"))
+@then(parsers.parse("the article's {field:w} matches the other"))
+def contais_others_field(context: Context, other: Context, field: str):
+    assert context.article
+    assert other.article
+    assert getattr(context.article, field) == getattr(other.article, field)

From be5975b5db93306f6cf047dc2fb2a336eb778f1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sat, 4 Jul 2020 01:52:35 -0500
Subject: [PATCH 12/35] Require dataclasses for python<3.7

---
 .github/workflows/pythonpackage.yml | 3 +--
 setup.py                            | 7 ++++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
index 7dd2afa..55ed64c 100644
--- a/.github/workflows/pythonpackage.yml
+++ b/.github/workflows/pythonpackage.yml
@@ -30,6 +30,5 @@ jobs:
         flake8 . --count --exit-zero --statistics
     - name: Test with pytest
       run: |
-        pip install pytest
-        pip install -e .
+        pip install -e .[test]
         pytest
diff --git a/setup.py b/setup.py
index c511c40..334b134 100644
--- a/setup.py
+++ b/setup.py
@@ -2,6 +2,8 @@
 
 """The setup script."""
 
+import sys
+
 from setuptools import setup, find_packages
 
 with open("README.md") as readme_file:
@@ -14,7 +16,10 @@
 
 setup_requirements = ["pytest-runner"]
 
-test_requirements = ["pytest", "pytest-bdd"]
+if sys.version_info < (3, 7):
+    test_requirements = ["pytest", "pytest-bdd", "dataclasses"]
+else:
+    test_requirements = ["pytest", "pytest-bdd"]
 
 setup(
     author="Core of Science",

From 3dcd5cf0aead1801f905b3d464e7eee77e228cd4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sat, 4 Jul 2020 01:55:30 -0500
Subject: [PATCH 13/35] try with that command

---
 .github/workflows/pythonpackage.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
index 55ed64c..e905151 100644
--- a/.github/workflows/pythonpackage.yml
+++ b/.github/workflows/pythonpackage.yml
@@ -29,6 +29,4 @@ jobs:
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
         flake8 . --count --exit-zero --statistics
     - name: Test with pytest
-      run: |
-        pip install -e .[test]
-        pytest
+      run: python setup.py test

From 2c66bd67809f4689e29facfde5a527a2fb674350 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sat, 4 Jul 2020 01:59:40 -0500
Subject: [PATCH 14/35] Use environment markers

---
 requirements_dev.txt |  1 +
 setup.py             | 10 +++-------
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/requirements_dev.txt b/requirements_dev.txt
index 8881d38..0565a53 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -5,3 +5,4 @@ coverage==5.1
 pytest==5.4.3
 pytest-runner==5.2
 pytest-bdd==3.4.0
+dataclasses==0.7; python_version < "3.7"
diff --git a/setup.py b/setup.py
index 334b134..4c3f5ae 100644
--- a/setup.py
+++ b/setup.py
@@ -2,9 +2,7 @@
 
 """The setup script."""
 
-import sys
-
-from setuptools import setup, find_packages
+from setuptools import find_packages, setup
 
 with open("README.md") as readme_file:
     readme = readme_file.read()
@@ -16,10 +14,8 @@
 
 setup_requirements = ["pytest-runner"]
 
-if sys.version_info < (3, 7):
-    test_requirements = ["pytest", "pytest-bdd", "dataclasses"]
-else:
-    test_requirements = ["pytest", "pytest-bdd"]
+
+test_requirements = ["pytest", "pytest-bdd", 'dataclasses; python_version<"3.7"']
 
 setup(
     author="Core of Science",

From 9bef219de835d4e76fe5e83ad7cf9d91b133e6bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sat, 4 Jul 2020 11:50:09 -0500
Subject: [PATCH 15/35] Add more article tests

---
 tests/features/article.feature |   6 ++
 tests/test_article.py          | 152 +++++++++++++++++++++++++++++++--
 2 files changed, 151 insertions(+), 7 deletions(-)

diff --git a/tests/features/article.feature b/tests/features/article.feature
index 1b214ff..aee3e90 100644
--- a/tests/features/article.feature
+++ b/tests/features/article.feature
@@ -28,3 +28,9 @@ Feature: Article manager class
       Then the article's doi matches the other
       And there's no error computing the label
       And the label contains the doi of the other
+
+   Scenario: Parse article from isi text
+      Given some valid isi text
+      When I create an article from the isi text
+      Then the values in the isi text are part of the article
+      And the isi text itself is part of the articles sources
diff --git a/tests/test_article.py b/tests/test_article.py
index c567223..60eb35e 100644
--- a/tests/test_article.py
+++ b/tests/test_article.py
@@ -1,13 +1,90 @@
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import Optional
+from typing import Optional, List
 
 from pytest import fixture
-from pytest_bdd import scenarios, given, then, when, parsers
+from pytest_bdd import scenarios, scenario, given, then, when, parsers
 
 from wostools.article import Article
 
 
+ISI_TEMPLATE = """
+PT J
+AU {author}
+   {second_author}
+AF {author}
+   {second_author}
+TI {title}
+SO JOURNAL OF MAGNETISM AND MAGNETIC MATERIALS
+LA English
+DT Article
+DE Electrodeposition; Structural control; Nanodot array; Bit-patterned
+   media; CoPt alloy
+ID BIT-PATTERNED MEDIA; ELECTRON-BEAM LITHOGRAPHY; RECORDING MEDIA;
+   MAGNETIC MEDIA; DENSITY; FILMS; ANISOTROPY; STORAGE
+AB CoPt nanodot arrays were fabricated by combining electrodeposition and electron beam lithography (EBL) for the use of bit-patterned media (BPM). To achieve precise control of deposition uniformity and coercivity of the CoPt nanodot arrays, their crystal structure and magnetic properties were controlled by controlling the diffusion state of metal ions from the initial deposition stage with the application of bath agitation. Following bath agitation, the composition gradient of the CoPt alloy with thickness was mitigated to have a near-ideal alloy composition of Co:Pt =80:20, which induces epitaxial-like growth from Ru substrate, thus resulting in the improvement of the crystal orientation of the hcp (002) structure from its initial deposition stages. Furthermore, the cross-sectional transmission electron microscope (TEM) analysis of the nanodots deposited with bath agitation showed CoPt growth along its c-axis oriented in the perpendicular direction, having uniform lattice fringes on the hcp (002) plane from the Ru underlayer interface, which is a significant factor to induce perpendicular magnetic anisotropy. Magnetic characterization of the CoPt nanodot arrays showed increase in the perpendicular coercivity and squareness of the hysteresis loops from 2.0 kOe and 0.64 (without agitation) to 4.0 kOe and 0.87 with bath agitation. Based on the detailed characterization of nanodot arrays, the precise crystal structure control of the nanodot arrays with ultra-high recording density by electrochemical process was successfully demonstrated.
+C1 [Wodarz, Siggi; Homma, Takayuki] Waseda Univ, Dept Appl Chem, Shinjuku Ku, Tokyo 1698555, Japan.
+   [Hasegawa, Takashi; Ishio, Shunji] Akita Univ, Dept Mat Sci, Akita 0108502, Japan.
+RP Homma, T (reprint author), Waseda Univ, Dept Appl Chem, Shinjuku Ku, Tokyo 1698555, Japan.
+EM t.homma@waseda.jp
+OI Hasegawa, Takashi/0000-0002-8178-4980
+FU JSPS KAKENHI Grant [25249104]
+FX This work was supported in part by JSPS KAKENHI Grant Number 25249104.
+CR Albrecht TR, 2013, IEEE T MAGN, V49, P773, DOI 10.1109/TMAG.2012.2227303
+   BUSCHOW KHJ, 1983, J MAGN MAGN MATER, V38, P1, DOI 10.1016/0304-8853(83)90097-5
+   Gapin AI, 2006, J APPL PHYS, V99, DOI 10.1063/1.2163289
+   Homma Takayuki, 2015, ECS Transactions, V64, P1, DOI 10.1149/06431.0001ecst
+   Kryder MH, 2008, P IEEE, V96, P1810, DOI 10.1109/JPROC.2008.2004315
+   Kubo T, 2005, J APPL PHYS, V97, DOI 10.1063/1.1855572
+   Lodder JC, 2004, J MAGN MAGN MATER, V272, P1692, DOI 10.1016/j.jmmm.2003.12.259
+   Mitsuzuka K, 2007, IEEE T MAGN, V43, P2160, DOI 10.1109/TMAG.2007.893129
+   Ouchi T, 2010, ELECTROCHIM ACTA, V55, P8081, DOI 10.1016/j.electacta.2010.02.073
+   Pattanaik G, 2006, J APPL PHYS, V99, DOI 10.1063/1.2150805
+   Pattanaik G, 2007, ELECTROCHIM ACTA, V52, P2755, DOI 10.1016/j.electacta.2006.07.062
+   Piramanayagam SN, 2009, J MAGN MAGN MATER, V321, P485, DOI 10.1016/j.jmmm.2008.05.007
+   Ross CA, 2008, MRS BULL, V33, P838, DOI 10.1557/mrs2008.179
+   Shiroishi Y, 2009, IEEE T MAGN, V45, P3816, DOI 10.1109/TMAG.2009.2024879
+   Sirtori V, 2011, ACS APPL MATER INTER, V3, P1800, DOI 10.1021/am200267u
+   Sohn JS, 2009, NANOTECHNOLOGY, V20, DOI 10.1088/0957-4484/20/2/025302
+   Sun SH, 2000, SCIENCE, V287, P1989, DOI 10.1126/science.287.5460.1989
+   Terris BD, 2007, MICROSYST TECHNOL, V13, P189, DOI 10.1007/s00542-006-0144-9
+   Wang JP, 2008, P IEEE, V96, P1847, DOI 10.1109/JPROC.2008.2004318
+   Weller D, 1999, IEEE T MAGN, V35, P4423, DOI 10.1109/20.809134
+   Weller D, 2000, IEEE T MAGN, V36, P10, DOI 10.1109/20.824418
+   Wodarz S, 2016, ELECTROCHIM ACTA, V197, P330, DOI 10.1016/j.electacta.2015.11.136
+   Xu X, 2012, J ELECTROCHEM SOC, V159, pD240, DOI 10.1149/2.090204jes
+   Yang X, 2007, J VAC SCI TECHNOL B, V25, P2202, DOI 10.1116/1.2798711
+   Yang XM, 2009, ACS NANO, V3, P1844, DOI 10.1021/nn900073r
+   Yasui N, 2003, APPL PHYS LETT, V83, P3347, DOI 10.1063/1.1622787
+   Yua H., 2009, J APPL PHYS, V105
+   Zhu JG, 2008, IEEE T MAGN, V44, P125, DOI 10.1109/TMAG.2007.911031
+NR 28
+TC 0
+Z9 0
+U1 21
+U2 21
+PU ELSEVIER SCIENCE BV
+PI AMSTERDAM
+PA PO BOX 211, 1000 AE AMSTERDAM, NETHERLANDS
+SN 0304-8853
+EI 1873-4766
+J9 {journal}
+JI J. Magn. Magn. Mater.
+PD MAY 15
+PY {year}
+VL {volume}
+BP {page}
+EP 58
+DI {doi}
+PG 7
+WC Materials Science, Multidisciplinary; Physics, Condensed Matter
+SC Materials Science; Physics
+GA EP2GP
+UT WOS:000397201600008
+ER
+""".strip()
+
+
 @dataclass
 class Context:
     article: Optional[Article]
@@ -16,11 +93,44 @@ class Context:
     error: Optional[Exception] = None
 
 
+@dataclass
+class ParseContext:
+    history: Optional[List[Article]] = None
+    error: Optional[Exception] = None
+    article: Optional[Article] = None
+
+    def push(self, article: Article):
+        if self.history is None:
+            self.history = []
+        self.history.append(article)
+        self.article = article
+        self.error = None
+
+
 scenarios("features/article.feature")
 
 
+@fixture
+def attributes():
+    return {
+        "title": "some title",
+        "author": "John Doe",
+        "second_author": "Jane Doe",
+        "authors": ["John Doe", "Jane Doe"],
+        "year": 1994,
+        "page": "1330-5",
+        "journal": "J MAGN MAGN MATER",
+        "volume": "1000",
+        "doi": "10.1016/j.jmmm.2017.01.061",
+    }
+
+
+@fixture
+def parse_context():
+    return ParseContext()
+
+
 @given("a complete article missing <field>", target_fixture="context")
-@given(parsers.parse("a complete article missing {field:w}"), target_fixture="context")
 def article_missing(field: str):
     article = Article(
         title=None, authors=["L, Robertson"], year=1999, journal="Science"
@@ -53,10 +163,9 @@ def similar_article_with_doi(context: Context):
     return Context(article=article)
 
 
-@when("I compute the label for the article")
-def compute_label_for_article(context: Context):
-    assert context.article, "Missing article for this step"
-    context.label = context.article.label
+@given("some valid isi text", target_fixture="isi_text")
+def valid_isi_text(attributes):
+    return ISI_TEMPLATE.format(**attributes)
 
 
 @when("I merge the two articles")
@@ -68,6 +177,7 @@ def merge_articles(context: Context, other: Context):
 
 
 @when("I try to compute the label for the article")
+@when("I compute the label for the article")
 def try_to_compute_label(context: Context):
     assert context.article, "Missing article for this step"
     try:
@@ -76,6 +186,12 @@ def try_to_compute_label(context: Context):
         context.error = e
 
 
+@when("I create an article from the isi text")
+def create_article_from_isi_text(isi_text, parse_context):
+    article = Article.from_isi_text(isi_text)
+    parse_context.push(article)
+
+
 @then("the label is a proper string")
 def then_label_is_a_proper_string(context: Context):
     assert context.expected_label
@@ -110,3 +226,25 @@ def contais_others_field(context: Context, other: Context, field: str):
     assert context.article
     assert other.article
     assert getattr(context.article, field) == getattr(other.article, field)
+
+
+@then("the values in the isi text are part of the article")
+def values_make_it_to_the_article(parse_context: ParseContext, attributes: dict):
+    assert parse_context.article, "no article parsed yet"
+    for field in [
+        "title",
+        "authors",
+        "year",
+        "page",
+        "journal",
+        "volume",
+        "doi",
+    ]:
+        assert getattr(parse_context.article, field)
+        assert getattr(parse_context.article, field) == attributes[field]
+
+
+@then("the isi text itself is part of the articles sources")
+def isi_text_in_sources(parse_context: ParseContext, isi_text: str):
+    assert parse_context.article, "no article parsed yet"
+    assert isi_text in parse_context.article.sources

From 00c87a545e224631def228f2fe7248bf56e6a961 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sat, 11 Jul 2020 17:38:04 -0500
Subject: [PATCH 16/35] Finish testing article

---
 .gitignore                     |   1 +
 Makefile                       |  14 ++-
 requirements_dev.txt           |   1 +
 tests/features/article.feature |  21 ++++
 tests/test_article.py          | 210 ++++++++++++++++++++++++++++-----
 5 files changed, 214 insertions(+), 33 deletions(-)

diff --git a/.gitignore b/.gitignore
index f2c6ad8..696b4cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -121,3 +121,4 @@ data/
 
 # json files
 *.json
+.testmondata
diff --git a/Makefile b/Makefile
index 1c88bfb..a8fb570 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,8 @@
-.PHONY: clean clean-test clean-pyc clean-build docs help
+.PHONY: clean clean-test clean-pyc clean-build docs help, test-watch
 .DEFAULT_GOAL := help
 
+NOTIFY_FILE := /tmp/pytest-$$(pwd | md5sum | cut -d " " -f 1)
+
 define BROWSER_PYSCRIPT
 import os, webbrowser, sys
 
@@ -54,7 +56,15 @@ lint: ## check style with flake8
 	flake8 wostools tests
 
 test: ## run tests quickly with the default Python
-	py.test
+	python -m pytest
+
+test-watch:
+	ptw \
+		--ext "py,feature" \
+		--onpass "coverage report --skip-empty --skip-covered -m" \
+		--onfail "notify-send.py -R $(NOTIFY_FILE) -i face-worried --hint int:transient:1 'Test failed' 'Ooops we have a problem, not all tests passed'" \
+		--onexit "notify-send.py -R $(NOTIFY_FILE) -i media-playback-stop --hint int:transient:1 'Test runner stopped' 'Just so you know, the test runner stopped'" \
+		--runner "coverage run --source wostools -m pytest" \
 
 coverage: ## check code coverage quickly with the default Python
 	coverage run --source wostools -m pytest
diff --git a/requirements_dev.txt b/requirements_dev.txt
index 0565a53..afcf706 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -4,5 +4,6 @@ coverage==5.1
 
 pytest==5.4.3
 pytest-runner==5.2
+pytest-watch==4.2.0
 pytest-bdd==3.4.0
 dataclasses==0.7; python_version < "3.7"
diff --git a/tests/features/article.feature b/tests/features/article.feature
index aee3e90..d156351 100644
--- a/tests/features/article.feature
+++ b/tests/features/article.feature
@@ -34,3 +34,24 @@ Feature: Article manager class
       When I create an article from the isi text
       Then the values in the isi text are part of the article
       And the isi text itself is part of the articles sources
+
+   Scenario: Parse article from invalid isi text
+      Given some isi text with invalid lines
+      When I create an article from the isi text
+      Then an invalid line error is risen
+
+   Scenario: Turn an article to dict
+      Given a reference article
+      When I turn the article into a dict
+      Then I get a reference dict of values
+
+   Scenario: Parse article from citation
+      Given some valid isi citation
+      When I create an article from the citation
+      Then the values of the citation are part of the article
+      And the citation itself is part of the articles sources
+
+   Scenario: Parse article from an invalid citation
+      Given some invalid isi citation
+      When I create an article from the citation
+      Then an invalid reference error is risen
\ No newline at end of file
diff --git a/tests/test_article.py b/tests/test_article.py
index 60eb35e..3dec07f 100644
--- a/tests/test_article.py
+++ b/tests/test_article.py
@@ -1,12 +1,13 @@
+from contextlib import contextmanager
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import Optional, List
+from typing import Dict, Generic, List, Optional, TypeVar, Iterator
 
 from pytest import fixture
-from pytest_bdd import scenarios, scenario, given, then, when, parsers
+from pytest_bdd import given, parsers, scenario, scenarios, then, when
 
 from wostools.article import Article
-
+from wostools.exceptions import InvalidIsiLine, InvalidReference
 
 ISI_TEMPLATE = """
 PT J
@@ -93,18 +94,41 @@ class Context:
     error: Optional[Exception] = None
 
 
+T = TypeVar("T")
+
+
 @dataclass
-class ParseContext:
-    history: Optional[List[Article]] = None
+class OperationContext(Generic[T]):
+    history: Optional[List[T]] = None
     error: Optional[Exception] = None
-    article: Optional[Article] = None
+    data: Optional[T] = None
 
-    def push(self, article: Article):
+    def push(self, data: Optional[T], error: Optional[Exception] = None):
         if self.history is None:
             self.history = []
-        self.history.append(article)
-        self.article = article
-        self.error = None
+        if self.data:
+            self.history.append(self.data)
+        self.data = data
+        self.error = error
+
+    @contextmanager
+    def capture(self):
+        try:
+            yield
+        except Exception as e:
+            self.push(None, error=e)
+
+    @contextmanager
+    def assert_data(self, name=None) -> Iterator[T]:
+        if name is None:
+            name = "data"
+        assert self.data, f"No {name} computed yet"
+        yield self.data
+
+    @contextmanager
+    def assert_error(self) -> Iterator[Exception]:
+        assert self.error, f"Expected an error and found none"
+        yield self.error
 
 
 scenarios("features/article.feature")
@@ -126,8 +150,36 @@ def attributes():
 
 
 @fixture
-def parse_context():
-    return ParseContext()
+def citation_attributes():
+    # Kryder MH, 2008, P IEEE, V96, P1810, DOI 10.1109/JPROC.2008.2004315
+    return {
+        "author": "L Antuan",
+        "year": "2008",
+        "journal": "P IEEE",
+        "volume": "69",
+        "page": "1810",
+        "doi": "DOI 10.1109/JPROC.2008.2004315",
+    }
+
+
+@fixture
+def label_context() -> OperationContext[str]:
+    return OperationContext()
+
+
+@fixture
+def parse_context() -> OperationContext[Article]:
+    return OperationContext()
+
+
+@fixture
+def citation_parse_context() -> OperationContext[Article]:
+    return OperationContext()
+
+
+@fixture
+def to_dict_context() -> OperationContext[Dict]:
+    return OperationContext()
 
 
 @given("a complete article missing <field>", target_fixture="context")
@@ -168,6 +220,44 @@ def valid_isi_text(attributes):
     return ISI_TEMPLATE.format(**attributes)
 
 
+@given("some isi text with invalid lines", target_fixture="isi_text")
+def invalid_lines_in_isi_text(attributes):
+    return """
+    INVALIDKEY This value is going to die
+    """.strip()
+
+
+@given("some invalid isi citation", target_fixture="isi_citation")
+def invalid_isi_citation():
+    return "Da Lambert, Hello"
+
+
+@given("some valid isi citation", target_fixture="isi_citation")
+def valid_isi_citation(citation_attributes):
+    return "{author}, {year}, {journal}, V{volume}, P{page}, DOI {doi}".format(
+        **citation_attributes
+    )
+
+
+@given("a reference article", target_fixture="context")
+def reference_article(attributes):
+    return Context(
+        article=Article(
+            title=attributes.get("title"),
+            authors=attributes.get("authors"),
+            year=attributes.get("year"),
+            journal=attributes.get("journal"),
+            volume=attributes.get("volume"),
+            page=attributes.get("page"),
+            doi=attributes.get("doi"),
+            references=attributes.get("references"),
+            keywords=attributes.get("keywords"),
+            sources=attributes.get("sources"),
+            extra=attributes.get("extra"),
+        )
+    )
+
+
 @when("I merge the two articles")
 def merge_articles(context: Context, other: Context):
     assert context.article, "Missing article for this step"
@@ -186,10 +276,27 @@ def try_to_compute_label(context: Context):
         context.error = e
 
 
+@when("I turn the article into a dict")
+def try_to_go_to_dict(context: Context, to_dict_context: OperationContext[Dict]):
+    assert context.article, "Missing article for this step"
+    with to_dict_context.capture():
+        to_dict_context.push(context.article.to_dict())
+
+
 @when("I create an article from the isi text")
-def create_article_from_isi_text(isi_text, parse_context):
-    article = Article.from_isi_text(isi_text)
-    parse_context.push(article)
+def create_article_from_isi_text(isi_text, parse_context: OperationContext[Article]):
+    assert isi_text, "define some isi text to parse"
+    with parse_context.capture():
+        parse_context.push(Article.from_isi_text(isi_text))
+
+
+@when("I create an article from the citation")
+def create_article_from_citation(
+    isi_citation, citation_parse_context: OperationContext[Article]
+):
+    assert isi_citation, "define some isi citation to parse"
+    with citation_parse_context.capture():
+        citation_parse_context.push(Article.from_isi_citation(isi_citation))
 
 
 @then("the label is a proper string")
@@ -229,22 +336,63 @@ def contais_others_field(context: Context, other: Context, field: str):
 
 
 @then("the values in the isi text are part of the article")
-def values_make_it_to_the_article(parse_context: ParseContext, attributes: dict):
-    assert parse_context.article, "no article parsed yet"
-    for field in [
-        "title",
-        "authors",
-        "year",
-        "page",
-        "journal",
-        "volume",
-        "doi",
-    ]:
-        assert getattr(parse_context.article, field)
-        assert getattr(parse_context.article, field) == attributes[field]
+def values_make_it_to_the_article(
+    parse_context: OperationContext[Article], attributes: dict
+):
+    with parse_context.assert_data() as article:
+        for field in [
+            "title",
+            "authors",
+            "year",
+            "page",
+            "journal",
+            "volume",
+            "doi",
+        ]:
+            assert getattr(article, field)
+            assert getattr(article, field) == attributes[field]
+
+
+@then("the values of the citation are part of the article")
+def citation_values_make_it_to_article(
+    citation_parse_context: OperationContext[Article], citation_attributes: dict
+):
+    with citation_parse_context.assert_data() as article:
+        assert article.authors == [citation_attributes["author"]]
+        for field in ["year", "journal", "page", "volume", "doi"]:
+            assert str(getattr(article, field)) == citation_attributes[field]
 
 
 @then("the isi text itself is part of the articles sources")
-def isi_text_in_sources(parse_context: ParseContext, isi_text: str):
-    assert parse_context.article, "no article parsed yet"
-    assert isi_text in parse_context.article.sources
+def isi_text_in_sources(parse_context: OperationContext[Article], isi_text: str):
+    assert parse_context.data, "no article parsed yet"
+    assert isi_text in parse_context.data.sources
+
+
+@then("the citation itself is part of the articles sources")
+def citation_in_sources(
+    citation_parse_context: OperationContext[Article], isi_citation: str
+):
+    with citation_parse_context.assert_data() as article:
+        assert isi_citation in article.sources
+
+
+@then("an invalid line error is risen")
+def invialid_isi_line_risen(parse_context: OperationContext[Article]):
+    with parse_context.assert_error() as error:
+        assert isinstance(error, InvalidIsiLine)
+
+
+@then("an invalid reference error is risen")
+def invialid_reference_risen(citation_parse_context: OperationContext[Article]):
+    with citation_parse_context.assert_error() as error:
+        assert isinstance(error, InvalidReference)
+
+
+@then("I get a reference dict of values")
+def get_a_reference_dict(to_dict_context: OperationContext[Dict], attributes: Dict):
+    with to_dict_context.assert_data() as article_dict:
+        assert any(article_dict.values()), "your dict has no values son"
+        for key, value in article_dict.items():
+            assert not value or key in attributes
+            assert not value or value == attributes[key]

From aeeeb58c29c168c07dd8e39c4d03e62cbad23bbd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sat, 11 Jul 2020 17:55:31 -0500
Subject: [PATCH 17/35] Make the code a bit shorter

---
 Makefile              |   6 +-
 tests/test_article.py | 135 ++++++++++++++++++++----------------------
 2 files changed, 66 insertions(+), 75 deletions(-)

diff --git a/Makefile b/Makefile
index a8fb570..915fa5a 100644
--- a/Makefile
+++ b/Makefile
@@ -59,11 +59,11 @@ test: ## run tests quickly with the default Python
 	python -m pytest
 
 test-watch:
-	ptw \
+	@ptw \
 		--ext "py,feature" \
 		--onpass "coverage report --skip-empty --skip-covered -m" \
-		--onfail "notify-send.py -R $(NOTIFY_FILE) -i face-worried --hint int:transient:1 'Test failed' 'Ooops we have a problem, not all tests passed'" \
-		--onexit "notify-send.py -R $(NOTIFY_FILE) -i media-playback-stop --hint int:transient:1 'Test runner stopped' 'Just so you know, the test runner stopped'" \
+		--onfail "notify-send.sh -R $(NOTIFY_FILE) -i face-worried --hint int:transient:1 'Test failed' 'Ooops we have a problem, not all tests passed'" \
+		--onexit "notify-send.sh -R $(NOTIFY_FILE) -i media-playback-stop --hint int:transient:1 'Test runner stopped' 'Just so you know, the test runner stopped'" \
 		--runner "coverage run --source wostools -m pytest" \
 
 coverage: ## check code coverage quickly with the default Python
diff --git a/tests/test_article.py b/tests/test_article.py
index 3dec07f..cc34768 100644
--- a/tests/test_article.py
+++ b/tests/test_article.py
@@ -87,18 +87,16 @@
 
 
 @dataclass
-class Context:
+class ArticleWrapper:
     article: Optional[Article]
     label: Optional[str] = None
-    expected_label: Optional[str] = None
-    error: Optional[Exception] = None
 
 
 T = TypeVar("T")
 
 
 @dataclass
-class OperationContext(Generic[T]):
+class Context(Generic[T]):
     history: Optional[List[T]] = None
     error: Optional[Exception] = None
     data: Optional[T] = None
@@ -163,56 +161,55 @@ def citation_attributes():
 
 
 @fixture
-def label_context() -> OperationContext[str]:
-    return OperationContext()
+def label_context() -> Context[str]:
+    return Context()
 
 
 @fixture
-def parse_context() -> OperationContext[Article]:
-    return OperationContext()
+def parse_context() -> Context[Article]:
+    return Context()
 
 
 @fixture
-def citation_parse_context() -> OperationContext[Article]:
-    return OperationContext()
+def citation_parse_context() -> Context[Article]:
+    return Context()
 
 
 @fixture
-def to_dict_context() -> OperationContext[Dict]:
-    return OperationContext()
+def to_dict_context() -> Context[Dict]:
+    return Context()
 
 
-@given("a complete article missing <field>", target_fixture="context")
+@given("a complete article missing <field>", target_fixture="wrapper")
 def article_missing(field: str):
     article = Article(
         title=None, authors=["L, Robertson"], year=1999, journal="Science"
     )
     setattr(article, field, None)
-    return Context(article=article)
+    return ArticleWrapper(article=article)
 
 
-@given("a complete article", target_fixture="context")
-@given("an article with authors, year and journal", target_fixture="context")
+@given("a complete article", target_fixture="wrapper")
+@given("an article with authors, year and journal", target_fixture="wrapper")
 def article_with_authors_year_and_journal():
-    return Context(
+    return ArticleWrapper(
         article=Article(
             title=None, authors=["L, Robertson"], year=1999, journal="Science"
         ),
-        expected_label="L Robertson, 1999, Science",
+        label="L Robertson, 1999, Science",
     )
 
 
 @given("theres a similar article that includes a doi", target_fixture="other")
-def similar_article_with_doi(context: Context):
-    assert context.article, "missing article to copy"
-    article = deepcopy(context.article)
+def similar_article_with_doi(wrapper: ArticleWrapper):
+    assert wrapper.article, "missing article to copy"
+    article = deepcopy(wrapper.article)
     article.doi = "somedoi/123"
-    if context.expected_label:
-        return Context(
-            article=article,
-            expected_label=", ".join([context.expected_label, article.doi]),
+    if wrapper.label:
+        return ArticleWrapper(
+            article=article, label=", ".join([wrapper.label, article.doi]),
         )
-    return Context(article=article)
+    return ArticleWrapper(article=article)
 
 
 @given("some valid isi text", target_fixture="isi_text")
@@ -239,9 +236,9 @@ def valid_isi_citation(citation_attributes):
     )
 
 
-@given("a reference article", target_fixture="context")
+@given("a reference article", target_fixture="wrapper")
 def reference_article(attributes):
-    return Context(
+    return ArticleWrapper(
         article=Article(
             title=attributes.get("title"),
             authors=attributes.get("authors"),
@@ -259,32 +256,30 @@ def reference_article(attributes):
 
 
 @when("I merge the two articles")
-def merge_articles(context: Context, other: Context):
-    assert context.article, "Missing article for this step"
+def merge_articles(wrapper: ArticleWrapper, other: ArticleWrapper):
+    assert wrapper.article, "Missing article for this step"
     assert other.article, "Missing other article for this step"
-    context.article = context.article.merge(other.article)
-    context.expected_label = None
+    wrapper.article = wrapper.article.merge(other.article)
+    wrapper.label = None
 
 
 @when("I try to compute the label for the article")
 @when("I compute the label for the article")
-def try_to_compute_label(context: Context):
-    assert context.article, "Missing article for this step"
-    try:
-        context.label = context.article.label
-    except Exception as e:
-        context.error = e
+def try_to_compute_label(label_context: Context[str], wrapper: ArticleWrapper):
+    assert wrapper.article, "Missing article for this step"
+    with label_context.capture():
+        label_context.push(wrapper.article.label)
 
 
 @when("I turn the article into a dict")
-def try_to_go_to_dict(context: Context, to_dict_context: OperationContext[Dict]):
-    assert context.article, "Missing article for this step"
+def try_to_go_to_dict(wrapper: ArticleWrapper, to_dict_context: Context[Dict]):
+    assert wrapper.article, "Missing article for this step"
     with to_dict_context.capture():
-        to_dict_context.push(context.article.to_dict())
+        to_dict_context.push(wrapper.article.to_dict())
 
 
 @when("I create an article from the isi text")
-def create_article_from_isi_text(isi_text, parse_context: OperationContext[Article]):
+def create_article_from_isi_text(isi_text, parse_context: Context[Article]):
     assert isi_text, "define some isi text to parse"
     with parse_context.capture():
         parse_context.push(Article.from_isi_text(isi_text))
@@ -292,7 +287,7 @@ def create_article_from_isi_text(isi_text, parse_context: OperationContext[Artic
 
 @when("I create an article from the citation")
 def create_article_from_citation(
-    isi_citation, citation_parse_context: OperationContext[Article]
+    isi_citation, citation_parse_context: Context[Article]
 ):
     assert isi_citation, "define some isi citation to parse"
     with citation_parse_context.capture():
@@ -300,45 +295,43 @@ def create_article_from_citation(
 
 
 @then("the label is a proper string")
-def then_label_is_a_proper_string(context: Context):
-    assert context.expected_label
-    assert context.label
-    assert context.label == context.expected_label
+def then_label_is_a_proper_string(label_context: Context[str], wrapper: ArticleWrapper):
+    with label_context.assert_data() as label:
+        assert label == wrapper.label
 
 
 @then("the label contains the doi of the other")
-def label_matches_other(context: Context, other: Context):
-    assert context.label, "You didn't get a label in the then block"
-    assert other.article and other.article.doi, "There's no doi in the other article"
-    assert other.article.doi in context.label
+def label_matches_other(label_context: Context[str], other: ArticleWrapper):
+    with label_context.assert_data() as label:
+        assert (
+            other.article and other.article.doi
+        ), "There's no doi in the other article"
+        assert other.article.doi in label
 
 
 @then("There's no error computing the label")
 @then("there's no error computing the label")
-def no_error_computing_label(context: Context):
-    assert context.label
-    assert not context.error
+def no_error_computing_label(label_context: Context[str]):
+    with label_context.assert_data():
+        pass
 
 
 @then("There's an error computing the label")
-def error_computing_label(context: Context):
-    assert not context.label
-    assert context.error
-    assert isinstance(context.error, ValueError)
+def error_computing_label(label_context: Context[str]):
+    with label_context.assert_error() as error:
+        assert isinstance(error, ValueError)
 
 
 @then(parsers.parse("the article matches the {field:w} of the other"))
 @then(parsers.parse("the article's {field:w} matches the other"))
-def contais_others_field(context: Context, other: Context, field: str):
-    assert context.article
+def contais_others_field(wrapper: ArticleWrapper, other: ArticleWrapper, field: str):
+    assert wrapper.article
     assert other.article
-    assert getattr(context.article, field) == getattr(other.article, field)
+    assert getattr(wrapper.article, field) == getattr(other.article, field)
 
 
 @then("the values in the isi text are part of the article")
-def values_make_it_to_the_article(
-    parse_context: OperationContext[Article], attributes: dict
-):
+def values_make_it_to_the_article(parse_context: Context[Article], attributes: dict):
     with parse_context.assert_data() as article:
         for field in [
             "title",
@@ -355,7 +348,7 @@ def values_make_it_to_the_article(
 
 @then("the values of the citation are part of the article")
 def citation_values_make_it_to_article(
-    citation_parse_context: OperationContext[Article], citation_attributes: dict
+    citation_parse_context: Context[Article], citation_attributes: dict
 ):
     with citation_parse_context.assert_data() as article:
         assert article.authors == [citation_attributes["author"]]
@@ -364,33 +357,31 @@ def citation_values_make_it_to_article(
 
 
 @then("the isi text itself is part of the articles sources")
-def isi_text_in_sources(parse_context: OperationContext[Article], isi_text: str):
+def isi_text_in_sources(parse_context: Context[Article], isi_text: str):
     assert parse_context.data, "no article parsed yet"
     assert isi_text in parse_context.data.sources
 
 
 @then("the citation itself is part of the articles sources")
-def citation_in_sources(
-    citation_parse_context: OperationContext[Article], isi_citation: str
-):
+def citation_in_sources(citation_parse_context: Context[Article], isi_citation: str):
     with citation_parse_context.assert_data() as article:
         assert isi_citation in article.sources
 
 
 @then("an invalid line error is risen")
-def invialid_isi_line_risen(parse_context: OperationContext[Article]):
+def invialid_isi_line_risen(parse_context: Context[Article]):
     with parse_context.assert_error() as error:
         assert isinstance(error, InvalidIsiLine)
 
 
 @then("an invalid reference error is risen")
-def invialid_reference_risen(citation_parse_context: OperationContext[Article]):
+def invialid_reference_risen(citation_parse_context: Context[Article]):
     with citation_parse_context.assert_error() as error:
         assert isinstance(error, InvalidReference)
 
 
 @then("I get a reference dict of values")
-def get_a_reference_dict(to_dict_context: OperationContext[Dict], attributes: Dict):
+def get_a_reference_dict(to_dict_context: Context[Dict], attributes: Dict):
     with to_dict_context.assert_data() as article_dict:
         assert any(article_dict.values()), "your dict has no values son"
         for key, value in article_dict.items():

From 6555d4f2c95e3296d6c50a23da12743006b0c68b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sat, 18 Jul 2020 10:16:03 -0500
Subject: [PATCH 18/35] Add some tests for fields

---
 tests/test_fields.py | 24 ++++++++++++++++++++++++
 wostools/fields.py   |  7 ++++++-
 2 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_fields.py

diff --git a/tests/test_fields.py b/tests/test_fields.py
new file mode 100644
index 0000000..bc18934
--- /dev/null
+++ b/tests/test_fields.py
@@ -0,0 +1,24 @@
+import pytest
+
+from wostools.fields import joined, delimited, integer
+
+
+def test_joined_joins_sequences():
+    assert joined(["hello", "world"]) == "hello world"
+
+
+def test_delimited_split_strings():
+    assert delimited(["key; word;", "more; words"]) == ["key", "word", "more", "words"]
+
+
+def test_delimited_split_strings_no_semi_at_the_end():
+    assert delimited(["key; word", "more; words"]) == ["key", "word", "more", "words"]
+
+
+def test_integer_integer_makes_an_integer():
+    assert integer(["1"]) == 1
+
+
+def test_integer_raises_if_more_than_one_value_is_passed():
+    with pytest.raises(ValueError):
+        integer(["", ""])
diff --git a/wostools/fields.py b/wostools/fields.py
index e8b83d9..e6fc31b 100644
--- a/wostools/fields.py
+++ b/wostools/fields.py
@@ -20,7 +20,12 @@ def ident(seq):
 
 
 def delimited(seq, delimiter="; "):
-    return joined(seq).split(delimiter)
+    return [
+        word.replace(delimiter.strip(), "")
+        for words in seq
+        for word in words.split(delimiter)
+        if word
+    ]
 
 
 def integer(seq):

From 1339ecf872414203eb83689e14d07e4af116b591 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sat, 18 Jul 2020 11:09:32 -0500
Subject: [PATCH 19/35] Cover the last bits of fields

---
 tests/test_fields.py | 17 ++++++++++++++++-
 wostools/fields.py   | 14 --------------
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/tests/test_fields.py b/tests/test_fields.py
index bc18934..04097ed 100644
--- a/tests/test_fields.py
+++ b/tests/test_fields.py
@@ -1,6 +1,6 @@
 import pytest
 
-from wostools.fields import joined, delimited, integer
+from wostools.fields import joined, delimited, integer, parse
 
 
 def test_joined_joins_sequences():
@@ -22,3 +22,18 @@ def test_integer_integer_makes_an_integer():
 def test_integer_raises_if_more_than_one_value_is_passed():
     with pytest.raises(ValueError):
         integer(["", ""])
+
+
+@pytest.mark.parametrize("header", ["VR", "FN"])
+def test_parse_ignores_headers(header):
+    assert parse(header, ["value", "value"]) == {}
+
+
+def test_parse_raises_on_unknown_fields():
+    with pytest.raises(ValueError):
+        assert parse("FG", ["value", "value"]) == {}
+
+
+def test_parse_raises_on_invalid_values():
+    with pytest.raises(ValueError):
+        assert parse("PY", ["1994b"]) == {}
diff --git a/wostools/fields.py b/wostools/fields.py
index e6fc31b..75411b9 100644
--- a/wostools/fields.py
+++ b/wostools/fields.py
@@ -35,10 +35,6 @@ def integer(seq):
     return int(first.strip())
 
 
-def unknown(key) -> IsiField:
-    return IsiField(key, key, joined, [])
-
-
 FIELDS = {
     "AB": IsiField("AB", "Abstract", joined, ["abstract"]),
     "AF": IsiField("AF", "Author Full Names", ident, ["author_full_names"]),
@@ -197,16 +193,6 @@ def parse(key: str, value: List) -> Dict:
         return {k: parsed for k in [key, *field.aliases]}
     except ValueError as e:
         raise ValueError(f"Field {key}: {e}")
-    except AttributeError as e:
-        raise AttributeError(f"Field {key}: {e}")
-
-
-def alias(raw: Dict) -> Dict:
-    output: Dict[str, Any] = {}
-    for key, value in output:
-        field = FIELDS.get(key, unknown(key))
-        output.update({k: value for k in [key, *field.aliases]})
-    return output
 
 
 def parse_all(raw_dict: Dict[str, List[str]]) -> Mapping[str, Any]:

From c8613a435d8c3ebbdc1bb37cf2fe9a77ac25d72c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sat, 18 Jul 2020 11:17:18 -0500
Subject: [PATCH 20/35] Update that missing label fields error

---
 wostools/article.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wostools/article.py b/wostools/article.py
index 4f4ffac..98da0aa 100644
--- a/wostools/article.py
+++ b/wostools/article.py
@@ -3,8 +3,8 @@
 import re
 from typing import Any, List, Mapping, Optional, Set
 
+from wostools.exceptions import InvalidIsiLine, InvalidReference, MissingLabelFields
 from wostools.fields import parse_all
-from wostools.exceptions import InvalidReference, InvalidIsiLine
 
 logger = logging.getLogger(__name__)
 
@@ -53,7 +53,7 @@ def __init__(
     @property
     def label(self):
         if not (self.authors and self.year and self.journal):
-            raise ValueError(self)
+            raise MissingLabelFields(self)
         pieces = {
             "AU": self.authors[0].replace(",", ""),
             "PY": str(self.year),

From 820c1ef506f3274297147a4bd54cef24056f170c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sun, 2 Aug 2020 12:56:27 -0500
Subject: [PATCH 21/35] Start testing the cached collection class

---
 tests/features/cached.feature   |  50 ++++++++++++
 tests/test_article.py           |  44 +---------
 tests/test_collection_cached.py | 137 ++++++++++++++++++++++++++++++++
 wostools/_testutils.py          |  39 +++++++++
 4 files changed, 230 insertions(+), 40 deletions(-)
 create mode 100644 tests/features/cached.feature
 create mode 100644 tests/test_collection_cached.py
 create mode 100644 wostools/_testutils.py

diff --git a/tests/features/cached.feature b/tests/features/cached.feature
new file mode 100644
index 0000000..85f8c4f
--- /dev/null
+++ b/tests/features/cached.feature
@@ -0,0 +1,50 @@
+Feature: cached collection
+
+   We want this kind of collection to avoid duplication at all costs
+
+   Scenario: preheat cache
+
+      Given some valid isi text
+      When I create a collection from that text
+      Then the collection's cache is preheated
+
+   Scenario: collection list articles and references
+
+      Given a valid collection
+      When I iterate over the collection
+      Then all articles and references are present
+
+   Scenario: list authors
+
+      Given a valid collection
+      When I iterate over the collection authors
+      Then all authors are included
+      And the author list include duplicates
+
+   Scenario: list coauthors
+
+      Given a valid collection
+      When I iterate over the collection coauthors
+      Then all coauthor pairs are included
+      And the coauthor list include duplicates
+
+   Scenario: duplicated articles are removed
+
+      Given some valid isi text
+      When I create a collection from that text
+      And I create a collection from twice that text
+      Then both collections have the same number of articles
+
+   Scenario: citation pairs
+
+      Given a valid collection
+      When I list the collection's citation pairs
+      Then all citation pairs are included
+
+   Scenario: citation pairs include complete info from references
+
+      Given some valid isi record
+      And a diferent isi record that references the former
+      When I create a collection from that text
+      And I list the collection's citation pairs
+      Then the citation always include all the available data
\ No newline at end of file
diff --git a/tests/test_article.py b/tests/test_article.py
index cc34768..ec5a7ff 100644
--- a/tests/test_article.py
+++ b/tests/test_article.py
@@ -1,14 +1,15 @@
-from contextlib import contextmanager
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import Dict, Generic, List, Optional, TypeVar, Iterator
+from typing import Dict, Optional
 
 from pytest import fixture
-from pytest_bdd import given, parsers, scenario, scenarios, then, when
+from pytest_bdd import given, parsers, scenarios, then, when
 
 from wostools.article import Article
 from wostools.exceptions import InvalidIsiLine, InvalidReference
 
+from wostools._testutils import Context
+
 ISI_TEMPLATE = """
 PT J
 AU {author}
@@ -92,43 +93,6 @@ class ArticleWrapper:
     label: Optional[str] = None
 
 
-T = TypeVar("T")
-
-
-@dataclass
-class Context(Generic[T]):
-    history: Optional[List[T]] = None
-    error: Optional[Exception] = None
-    data: Optional[T] = None
-
-    def push(self, data: Optional[T], error: Optional[Exception] = None):
-        if self.history is None:
-            self.history = []
-        if self.data:
-            self.history.append(self.data)
-        self.data = data
-        self.error = error
-
-    @contextmanager
-    def capture(self):
-        try:
-            yield
-        except Exception as e:
-            self.push(None, error=e)
-
-    @contextmanager
-    def assert_data(self, name=None) -> Iterator[T]:
-        if name is None:
-            name = "data"
-        assert self.data, f"No {name} computed yet"
-        yield self.data
-
-    @contextmanager
-    def assert_error(self) -> Iterator[Exception]:
-        assert self.error, f"Expected an error and found none"
-        yield self.error
-
-
 scenarios("features/article.feature")
 
 
diff --git a/tests/test_collection_cached.py b/tests/test_collection_cached.py
new file mode 100644
index 0000000..64344db
--- /dev/null
+++ b/tests/test_collection_cached.py
@@ -0,0 +1,137 @@
+import io
+from typing import Collection
+
+from pytest import fixture
+from pytest_bdd import scenario, given, when, then
+
+from wostools import CachedCollection
+from wostools._testutils import Context
+
+ISI_TEXT = """
+FN Thomson Reuters Web of Science™
+VR 1.0
+PT J
+AU Sun, ZW
+   Russell, TP
+AF Sun, Zhiwei
+   Russell, Thomas P.
+TI In situ grazing incidence small-angle X-ray scattering study of solvent
+   vapor annealing in lamellae-forming block copolymer thin films:
+   Trade-off of defects in deswelling
+SO JOURNAL OF POLYMER SCIENCE PART B-POLYMER PHYSICS
+LA English
+DT Article
+DE annealing; block copolymers; self-assembly; thin films; X-ray
+ID BIT-PATTERNED MEDIA; LITHOGRAPHY; GRAPHENE; ARRAYS; ORIENTATION;
+   NANOWIRES; PARALLEL; BEHAVIOR; INPLANE; DENSITY
+AB Solvent vapor annealing (SVA) is one route to prepare block copolymer (BCP) thin films with long-range lateral ordering. The lattice defects in the spin-coated BCP thin film can be effectively and rapidly reduced using SVA. The solvent evaporation after annealing was shown to have a significant impact on the in-plane ordering of BCP microdomains. However, the effect of solvent evaporation on the out-of-plane defects in BCPs has not been considered. Using grazing-incidence x-ray scattering, the morphology evolution of lamellae-forming poly(2-vinlypyridine)-b-polystyrene-b-poly(2vinylpyridine) triblock copolymers, having lamellar microdomains oriented normal to substrate surface during SVA, was studied in this work. A micelle to lamellae transformation was observed during solvent uptake. The influence of solvent swelling ratio and solvent removal rate on both the in-plane and out-of-plane defect density was studied. It shows that there is a trade-off between the in-plane and out-of-plane defect densities during solvent evaporation. (c) 2017 Wiley Periodicals, Inc. J. Polym. Sci., Part B: Polym. Phys. 2017, 55, 980-989
+C1 [Sun, Zhiwei; Russell, Thomas P.] Univ Massachusetts Amherst, Dept Polymer Sci & Engn, Amherst, MA 01003 USA.
+   [Russell, Thomas P.] Lawrence Berkeley Natl Lab, Div Mat Sci, Berkeley, CA 94720 USA.
+   [Russell, Thomas P.] Beijing Univ Chem Technol, Beijing Adv Innovat Ctr Soft Matter Sci & Engn, Beijing, Peoples R China.
+RP Russell, TP (reprint author), Univ Massachusetts Amherst, Dept Polymer Sci & Engn, Amherst, MA 01003 USA.; Russell, TP (reprint author), Lawrence Berkeley Natl Lab, Div Mat Sci, Berkeley, CA 94720 USA.; Russell, TP (reprint author), Beijing Univ Chem Technol, Beijing Adv Innovat Ctr Soft Matter Sci & Engn, Beijing, Peoples R China.
+EM russell@mail.pse.umass.edu
+FU U.S. Department of Energy BES [BES-DE-FG02-96ER45612]; Director of the
+   Office of Science, Office of Basic Energy Sciences, of the U.S.
+   Department of Energy [DE-AC02-05CH11231]; Office of Science, Office of
+   Basic Energy Sciences, of the U.S. Department of Energy
+   [DE-AC02-05CH11231]
+FX The authors acknowledge the facility support in Advanced Light Source
+   and Molecular Foundry in Lawrence Berkeley National Laboratory. This
+   work was supported by the U.S. Department of Energy BES under contract
+   BES-DE-FG02-96ER45612. The GISAXS characterization in beamline 7.3.3 of
+   the Advanced Light Source is supported by the Director of the Office of
+   Science, Office of Basic Energy Sciences, of the U.S. Department of
+   Energy under contract no. DE-AC02-05CH11231. The SEM and AFM
+   characterization in the Molecular Foundry was supported by the Office of
+   Science, Office of Basic Energy Sciences, of the U.S. Department of
+   Energy under contract no. DE-AC02-05CH11231.
+CR Bai W, 2015, MACROMOLECULES, V48, P8574, DOI 10.1021/acs.macromol.5b02174
+   Bosworth JK, 2011, MACROMOLECULES, V44, P9196, DOI 10.1021/ma201967a
+   Bosworth JK, 2010, J PHOTOPOLYM SCI TEC, V23, P145, DOI 10.2494/photopolymer.23.145
+   Chai J, 2008, ACS NANO, V2, P489, DOI 10.1021/nn700341s
+   Chai J, 2007, NAT NANOTECHNOL, V2, P500, DOI 10.1038/nnano.2007.227
+   Choi S, 2012, SOFT MATTER, V8, P3463, DOI 10.1039/c2sm07297a
+   Di ZY, 2012, MACROMOLECULES, V45, P5185, DOI 10.1021/ma3004136
+   Farrell RA, 2012, NANOSCALE, V4, P3228, DOI 10.1039/c2nr00018k
+   Gowd E. B., 2010, IOP C SER MAT SCI EN, V14
+   Gu XD, 2014, ADV MATER, V26, P273, DOI 10.1002/adma.201302562
+   Gunkel I, 2016, J POLYM SCI POL PHYS, V54, P331, DOI 10.1002/polb.23933
+   Ilavsky J, 2012, J APPL CRYSTALLOGR, V45, P324, DOI 10.1107/S0021889812004037
+   Jeong SJ, 2010, NANO LETT, V10, P3500, DOI 10.1021/nl101637f
+   Ji S, 2008, MACROMOLECULES, V41, P9098, DOI 10.1021/ma801861h
+   Khaira GS, 2014, ACS MACRO LETT, V3, P747, DOI 10.1021/mz5002349
+   Kikitsu A, 2013, IEEE T MAGN, V49, P693, DOI 10.1109/TMAG.2012.2226566
+   Kim BH, 2011, ADV MATER, V23, P5618, DOI 10.1002/adma.201103650
+   Kim BH, 2010, ACS NANO, V4, P5464, DOI 10.1021/nn101491g
+   Kurihara M, 2013, JPN J APPL PHYS, V52, DOI 10.7567/JJAP.52.086201
+   Liu GX, 2012, ACS NANO, V6, P6786, DOI 10.1021/nn301515a
+   Mahadevapuram N, 2016, J POLYM SCI POL PHYS, V54, P339, DOI 10.1002/polb.23937
+   Paik MY, 2010, MACROMOLECULES, V43, P4253, DOI 10.1021/ma902646t
+   Sinturel C, 2014, ACS APPL MATER INTER, V6, P12146, DOI 10.1021/am504086x
+   Sun ZW, 2015, ADV MATER, V27, P4364, DOI 10.1002/adma.201501585
+   Vu T, 2011, MACROMOLECULES, V44, P6121, DOI 10.1021/ma2009222
+   Thurn-Albrecht T, 2000, SCIENCE, V290, P2126, DOI 10.1126/science.290.5499.2126
+   Wan L., 2012, MOEMS, V11, P31405
+   Wang JY, 2008, LANGMUIR, V24, P3545, DOI 10.1021/la703559q
+   Xiao S., 2013, MOEMS, V12
+   Xiao SG, 2014, ACS NANO, V8, P11854, DOI 10.1021/nn505630t
+   Xiao SG, 2014, J POLYM SCI POL PHYS, V52, P361, DOI 10.1002/polb.23433
+   Yamamoto R, 2014, IEEE T MAGN, V50, DOI 10.1109/TMAG.2013.2284474
+   Yang X., 2014, MOEMS, V13
+   Yang X., 2013, J MATER RES, V2013, P1
+   Yang XM, 2014, NANOTECHNOLOGY, V25, DOI 10.1088/0957-4484/25/39/395301
+   Yang XM, 2009, ACS NANO, V3, P1844, DOI 10.1021/nn900073r
+   Zhang JQ, 2014, MACROMOLECULES, V47, P5711, DOI 10.1021/ma500633b
+NR 37
+TC 0
+Z9 0
+U1 1
+U2 1
+PU WILEY
+PI HOBOKEN
+PA 111 RIVER ST, HOBOKEN 07030-5774, NJ USA
+SN 0887-6266
+EI 1099-0488
+J9 J POLYM SCI POL PHYS
+JI J. Polym. Sci. Pt. B-Polym. Phys.
+PD JUL 1
+PY 2017
+VL 55
+IS 13
+BP 980
+EP 989
+DI 10.1002/polb.24346
+PG 10
+WC Polymer Science
+SC Polymer Science
+GA EU7BQ
+UT WOS:000401190100002
+ER
+""".strip()
+
+
+@scenario("features/cached.feature", "preheat cache")
+def test_preheat_cache():
+    pass
+
+
+@fixture
+def collection_context() -> Context[CachedCollection]:
+    return Context()
+
+
+@given("some valid isi text", target_fixture="isi_text")
+def valid_isi_text():
+    return ISI_TEXT
+
+
+@when("I create a collection from that text")
+def create_collection(isi_text, collection_context: Context[CachedCollection]):
+    with collection_context.capture():
+        collection_context.push(CachedCollection(io.StringIO(isi_text)))
+
+
+@then("the collection's cache is preheated")
+def the_collection_cache_is_preheated(collection_context: Context[CachedCollection]):
+    with collection_context.assert_data() as collection:
+        assert collection._cache
diff --git a/wostools/_testutils.py b/wostools/_testutils.py
new file mode 100644
index 0000000..2c5b94f
--- /dev/null
+++ b/wostools/_testutils.py
@@ -0,0 +1,39 @@
+from contextlib import contextmanager
+from dataclasses import dataclass
+from typing import Generic, Iterator, List, Optional, TypeVar
+
+T = TypeVar("T")
+
+
+@dataclass
+class Context(Generic[T]):
+    history: Optional[List[T]] = None
+    error: Optional[Exception] = None
+    data: Optional[T] = None
+
+    def push(self, data: Optional[T], error: Optional[Exception] = None):
+        if self.history is None:
+            self.history = []
+        if self.data:
+            self.history.append(self.data)
+        self.data = data
+        self.error = error
+
+    @contextmanager
+    def capture(self):
+        try:
+            yield
+        except Exception as e:
+            self.push(None, error=e)
+
+    @contextmanager
+    def assert_data(self, name=None) -> Iterator[T]:
+        if name is None:
+            name = "data"
+        assert self.data, f"No {name} computed yet"
+        yield self.data
+
+    @contextmanager
+    def assert_error(self) -> Iterator[Exception]:
+        assert self.error, f"Expected an error and found none"
+        yield self.error

From 583d4df3421de090546044634ae44df3b50896c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sun, 2 Aug 2020 12:56:49 -0500
Subject: [PATCH 22/35] Remove test code that's not required anymore

---
 tests/_test_wostools.py | 265 ----------------------------------------
 1 file changed, 265 deletions(-)
 delete mode 100644 tests/_test_wostools.py

diff --git a/tests/_test_wostools.py b/tests/_test_wostools.py
deleted file mode 100644
index 9cd307f..0000000
--- a/tests/_test_wostools.py
+++ /dev/null
@@ -1,265 +0,0 @@
-"""Tests for `wostools` package."""
-
-from click.testing import CliRunner
-
-from wostools import LazyCollection, CachedCollection
-from wostools import cli
-from wostools import Article
-import pytest
-import io
-
-
-def test_article_label(article):
-    """
-    Test label value of article.
-    """
-    assert article.label == (
-        "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061"
-    )
-
-
-def test_parsers(article):
-    assert article.extra["PT"] == "J"
-    assert article.authors == ["Wodarz, S", "Hasegawa, T", "Ishio, S", "Homma, T"]
-    assert article.extra["AF"] == [
-        "Wodarz, Siggi",
-        "Hasegawa, Takashi",
-        "Ishio, Shunji",
-        "Homma, Takayuki",
-    ]
-    assert (
-        article.title
-        == "Structural control of ultra-fine CoPt nanodot arrays via electrodeposition process"
-    )
-    assert article.extra["SO"] == "JOURNAL OF MAGNETISM AND MAGNETIC MATERIALS"
-
-
-def test_article_attributes(article):
-    assert set(article.extra.keys()).issuperset(
-        {
-            "PT",
-            "AU",
-            "AF",
-            "TI",
-            "SO",
-            "LA",
-            "DT",
-            "DE",
-            "ID",
-            "AB",
-            "C1",
-            "RP",
-            "EM",
-            "OI",
-            "FU",
-            "FX",
-            "CR",
-            "NR",
-            "TC",
-            "Z9",
-            "U1",
-            "U2",
-            "PU",
-            "PI",
-            "PA",
-            "SN",
-            "EI",
-            "J9",
-            "JI",
-            "PD",
-            "PY",
-            "VL",
-            "BP",
-            "EP",
-            "DI",
-            "PG",
-            "WC",
-            "SC",
-            "GA",
-            "UT",
-        }
-    )
-
-
-def test_article_extra(article):
-    data = article.extra
-    assert data.get("AB") == data.get("abstract")
-    assert data.get("AF") == data.get("author_full_names")
-    assert data.get("AR") == data.get("article_number")
-    assert data.get("AU") == data.get("authors")
-    assert data.get("BA") == data.get("book_authors")
-    assert data.get("BE") == data.get("editors")
-    assert data.get("BF") == data.get("book_authors_full_name")
-    assert data.get("BN") == data.get("international_standard_book_number")
-    assert data.get("BP") == data.get("beginning_page")
-    assert data.get("BS") == data.get("book_series_subtitle")
-    assert data.get("C1") == data.get("author_address")
-    assert data.get("CA") == data.get("group_authors")
-    assert data.get("CL") == data.get("conference_location")
-    assert data.get("CR") == data.get("cited_references")
-    assert data.get("CR") == data.get("references")
-    assert data.get("CR") == data.get("citations")
-    assert data.get("CT") == data.get("conference_title")
-    assert data.get("CY") == data.get("conference_date")
-    assert data.get("DE") == data.get("author_keywords")
-    assert data.get("DI") == data.get("digital_object_identifier")
-    assert data.get("DT") == data.get("document_type")
-    assert data.get("D2") == data.get("book_digital_object_identifier")
-    assert data.get("ED") == data.get("editors")
-    assert data.get("EM") == data.get("email_address")
-    assert data.get("EI") == data.get("eissn")
-    assert data.get("EP") == data.get("ending_page")
-    assert data.get("FU") == data.get("funding_agency_and_grant_number")
-    assert data.get("FX") == data.get("funding_text")
-    assert data.get("GA") == data.get("document_delivery_number")
-    assert data.get("GP") == data.get("book_group_authors")
-    assert data.get("HO") == data.get("conference_host")
-    assert data.get("ID") == data.get("keywords_plus")
-    assert data.get("ID") == data.get("keywords")
-    assert data.get("IS") == data.get("issue")
-    assert data.get("J9") == data.get("source_abbreviation")
-    assert data.get("JI") == data.get("iso_source_abbreviation")
-    assert data.get("LA") == data.get("language")
-    assert data.get("MA") == data.get("meeting_abstract")
-    assert data.get("NR") == data.get("cited_reference_count")
-    assert data.get("OI") == data.get("orcid_identifier")
-    assert data.get("P2") == data.get("chapter_count")
-    assert data.get("PA") == data.get("publisher_address")
-    assert data.get("PD") == data.get("publication_date")
-    assert data.get("PG") == data.get("page_count")
-    assert data.get("PI") == data.get("publisher_city")
-    assert data.get("PM") == data.get("pubmed_id")
-    assert data.get("PN") == data.get("part_number")
-    assert data.get("PT") == data.get("publication_type")
-    assert data.get("PU") == data.get("publisher")
-    assert data.get("PY") == data.get("year_published")
-    assert data.get("RI") == data.get("researcherid_number")
-    assert data.get("RP") == data.get("reprint_address")
-    assert data.get("SC") == data.get("research_areas")
-    assert data.get("SE") == data.get("book_series_title")
-    assert data.get("SI") == data.get("special_issue")
-    assert data.get("SN") == data.get("issn")
-    assert data.get("SP") == data.get("conference_sponsors")
-    assert data.get("SU") == data.get("supplement")
-    assert data.get("TC") == data.get("wos_times_cited_count")
-    assert data.get("TC") == data.get("wos_times_cited")
-    assert data.get("TI") == data.get("title")
-    assert data.get("U1") == data.get("usage_count")
-    assert data.get("U2") == data.get("usage_count")
-    assert data.get("UT") == data.get("unique_article_identifier")
-    assert data.get("VL") == data.get("volume")
-    assert data.get("WC") == data.get("web_of_science_categories")
-    assert data.get("Z9") == data.get("total_times_cited_count")
-    assert data.get("Z9") == data.get("times_cited")
-
-
-def test_article_properties(article):
-    assert isinstance(article.extra, dict)
-
-
-def test_collection_from_filenames(collection_many_documents):
-    for article in collection_many_documents:
-        assert isinstance(article, Article)
-
-    for file in collection_many_documents._files:
-        assert hasattr(file, "read")
-        assert isinstance(file, (io.StringIO, io.TextIOWrapper))
-        assert file.tell() == 0
-
-
-@pytest.mark.parametrize(
-    "cls,count", [(LazyCollection, 13892), (CachedCollection, 8797)]
-)
-def test_collection_from_glob(cls, count):
-    collection = cls.from_glob("docs/examples/*.txt")
-    for article in collection:
-        assert isinstance(article, Article)
-
-    assert len(list(collection)) == count
-
-    for file in collection._files:
-        assert hasattr(file, "read")
-        assert isinstance(file, (io.StringIO, io.TextIOWrapper))
-        assert file.tell() == 0
-
-
-def test_collection_from_streams(filename_single_document):
-    with open(filename_single_document) as file:
-        _ = file.read()
-
-        collection = LazyCollection(file)
-        for article in collection:
-            assert isinstance(article, Article)
-
-        for file in collection._files:
-            assert hasattr(file, "read")
-            assert isinstance(file, (io.StringIO, io.TextIOWrapper))
-            assert file.tell() == 0
-
-
-def test_collection_with_duplicated(filename_single_document, filename_many_documents):
-    collection = LazyCollection.from_filenames(filename_single_document)
-    assert len(list(collection._files)) == 1
-    assert len(list(collection)) == 29
-
-    collection = LazyCollection.from_filenames(
-        filename_single_document, filename_single_document, filename_single_document
-    )
-    assert len(list(collection._files)) == 3
-    assert len(list(collection)) == 3 * 29
-
-
-def test_cached_collection_with_duplicated(
-    filename_single_document, filename_many_documents
-):
-    collection = CachedCollection.from_filenames(filename_single_document)
-    assert len(list(collection._files)) == 1
-    assert len(list(collection)) == 29
-
-    collection = CachedCollection.from_filenames(
-        filename_single_document, filename_single_document
-    )
-    assert len(list(collection._files)) == 2
-    assert len(list(collection)) == 29
-
-
-def test_collection_authors(collection_single_document):
-    assert {"Wodarz, S", "Hasegawa, T", "Ishio, S", "Homma, T"}.issubset(
-        set(collection_single_document.authors)
-    )
-
-
-def test_collection_coauthors(collection_single_document):
-    coauthors = collection_single_document.coauthors
-    assert {
-        ("Hasegawa, T", "Homma, T"),
-        ("Hasegawa, T", "Ishio, S"),
-        ("Hasegawa, T", "Wodarz, S"),
-        ("Homma, T", "Ishio, S"),
-        ("Homma, T", "Wodarz, S"),
-        ("Ishio, S", "Wodarz, S"),
-    }.issubset(set(coauthors))
-
-
-def test_command_line_interface():
-    """Test the CLI."""
-    runner = CliRunner()
-    result = runner.invoke(cli.main)
-    assert result.exit_code == 0
-    assert "A little cli for wos tools" in result.output
-    help_result = runner.invoke(cli.main, ["--help"])
-    assert help_result.exit_code == 0
-    assert "--help  Show this message and exit." in help_result.output
-
-
-def test_command_line_interface_citation_pairs(filename_single_document):
-    runner = CliRunner()
-    result = runner.invoke(cli.citation_pairs)
-    assert result.exit_code == 0
-    assert "You should give at least a file with documents." in result.output
-
-    result = runner.invoke(cli.citation_pairs, filename_single_document)
-    assert (
-        "Wodarz S, 2017, J MAGN MAGN MATER, V430, P52, DOI 10.1016/j.jmmm.2017.01.061"
-        in result.output
-    )

From 9bc3dcd0114295885aab21b67af3ca380413d943 Mon Sep 17 00:00:00 2001
From: Juan David Alzate Cardona <jdalzatec@unal.edu.co>
Date: Fri, 7 Aug 2020 15:12:54 -0500
Subject: [PATCH 23/35] Refactor fixture to be used in other scenario

---
 tests/features/cached.feature   |  2 +-
 tests/test_collection_cached.py | 27 ++++++++++++++++++++++-----
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/tests/features/cached.feature b/tests/features/cached.feature
index 85f8c4f..354996f 100644
--- a/tests/features/cached.feature
+++ b/tests/features/cached.feature
@@ -30,7 +30,7 @@ Feature: cached collection
 
    Scenario: duplicated articles are removed
 
-      Given somve valid isi text
+      Given some valid isi text
       When I create a collection from that text
       And I create a collection from twice that text
       Then both collections have the same number of articles
diff --git a/tests/test_collection_cached.py b/tests/test_collection_cached.py
index 64344db..4d136c9 100644
--- a/tests/test_collection_cached.py
+++ b/tests/test_collection_cached.py
@@ -120,15 +120,32 @@ def collection_context() -> Context[CachedCollection]:
     return Context()
 
 
-@given("some valid isi text", target_fixture="isi_text")
-def valid_isi_text():
+@fixture
+def isi_text():
     return ISI_TEXT
 
 
-@when("I create a collection from that text")
-def create_collection(isi_text, collection_context: Context[CachedCollection]):
+@given("some valid isi text")
+def valid_isi_text(isi_text):
+    return isi_text
+
+
+@fixture
+def create_valid_collection(isi_text, collection_context: Context[CachedCollection]):
     with collection_context.capture():
-        collection_context.push(CachedCollection(io.StringIO(isi_text)))
+        collection = CachedCollection(io.StringIO(isi_text))
+        collection_context.push(collection)
+    return collection_context
+
+
+@when("I create a collection from that text")
+def create_collection(create_valid_collection):
+    pass
+
+
+@given("a valid collection")
+def context_valid_collection(create_valid_collection):
+    return create_valid_collection
 
 
 @then("the collection's cache is preheated")

From 3cf4e6133842616168d4e45c713b0bfb3ef7792c Mon Sep 17 00:00:00 2001
From: Juan David Alzate Cardona <jdalzatec@unal.edu.co>
Date: Fri, 7 Aug 2020 15:14:11 -0500
Subject: [PATCH 24/35] Implement scenario

---
 tests/test_collection_cached.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/test_collection_cached.py b/tests/test_collection_cached.py
index 4d136c9..0f3678e 100644
--- a/tests/test_collection_cached.py
+++ b/tests/test_collection_cached.py
@@ -110,7 +110,7 @@
 """.strip()
 
 
-@scenario("features/cached.feature", "preheat cache")
+@scenario("features/cached.feature", "collection list articles and references")
 def test_preheat_cache():
     pass
 
@@ -152,3 +152,13 @@ def context_valid_collection(create_valid_collection):
 def the_collection_cache_is_preheated(collection_context: Context[CachedCollection]):
     with collection_context.assert_data() as collection:
         assert collection._cache
+
+
+@when("I iterate over the collection")
+@then("all articles and references are present")
+def iterate_over_collection(context_valid_collection: Context[CachedCollection]):
+    with context_valid_collection.assert_data() as collection:
+        assert len(collection) == 38
+        for article in collection:
+            assert article
+            assert article.label

From a61abbe19e90eb405d85458993d9975441b01d2c Mon Sep 17 00:00:00 2001
From: Juan David Alzate Cardona <jdalzatec@unal.edu.co>
Date: Fri, 7 Aug 2020 15:18:44 -0500
Subject: [PATCH 25/35] Implement scenario

---
 tests/test_collection_cached.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/tests/test_collection_cached.py b/tests/test_collection_cached.py
index 0f3678e..8b2256e 100644
--- a/tests/test_collection_cached.py
+++ b/tests/test_collection_cached.py
@@ -1,5 +1,5 @@
 import io
-from typing import Collection
+from typing import Collection, Dict
 
 from pytest import fixture
 from pytest_bdd import scenario, given, when, then
@@ -110,7 +110,7 @@
 """.strip()
 
 
-@scenario("features/cached.feature", "collection list articles and references")
+@scenario("features/cached.feature", "list authors")
 def test_preheat_cache():
     pass
 
@@ -162,3 +162,22 @@ def iterate_over_collection(context_valid_collection: Context[CachedCollection])
         for article in collection:
             assert article
             assert article.label
+
+
+@when("I iterate over the collection authors")
+@then("all authors are included")
+@then("the author list include duplicates")
+def iterate_over_collection_authors(
+    context_valid_collection: Context[CachedCollection],
+):
+    with context_valid_collection.assert_data() as collection:
+        assert collection.authors
+
+        authors: Dict[str, int] = {}
+        for author in collection.authors:
+            authors[author] = authors.get(author, 0) + 1
+            assert author
+
+        for author, count in authors.items():
+            assert author in ISI_TEXT
+            assert count >= 1

From ec8fa946f2254c2f6dbc0cfc8790adc82df4c326 Mon Sep 17 00:00:00 2001
From: Juan David Alzate Cardona <jdalzatec@unal.edu.co>
Date: Fri, 7 Aug 2020 15:50:54 -0500
Subject: [PATCH 26/35] Implement scenario

---
 tests/test_collection_cached.py | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/tests/test_collection_cached.py b/tests/test_collection_cached.py
index 8b2256e..82b169d 100644
--- a/tests/test_collection_cached.py
+++ b/tests/test_collection_cached.py
@@ -1,5 +1,5 @@
 import io
-from typing import Collection, Dict
+from typing import Collection, Dict, Tuple
 
 from pytest import fixture
 from pytest_bdd import scenario, given, when, then
@@ -110,7 +110,7 @@
 """.strip()
 
 
-@scenario("features/cached.feature", "list authors")
+@scenario("features/cached.feature", "list coauthors")
 def test_preheat_cache():
     pass
 
@@ -181,3 +181,28 @@ def iterate_over_collection_authors(
         for author, count in authors.items():
             assert author in ISI_TEXT
             assert count >= 1
+
+
+@when("I iterate over the collection coauthors")
+@then("all coauthor pairs are included")
+@then("the coauthor list include duplicates")
+def iterate_over_collection_coauthors(
+    context_valid_collection: Context[CachedCollection],
+):
+    with context_valid_collection.assert_data() as collection:
+        assert collection.coauthors
+
+        coauthors: Dict[Tuple[str, str], int] = {}
+        for pair in collection.coauthors:
+            coauthors[pair] = coauthors.get(pair, 0) + 1
+
+            author, coauthor = pair
+            assert author
+            assert coauthor
+
+        for pair, count in coauthors.items():
+            author, coauthor = pair
+            assert author in ISI_TEXT
+            assert coauthor in ISI_TEXT
+            assert count >= 1
+

From 9c61407b77ad76dfd632d564f28d19b48ea3e448 Mon Sep 17 00:00:00 2001
From: Juan David Alzate Cardona <jdalzatec@unal.edu.co>
Date: Fri, 7 Aug 2020 15:55:19 -0500
Subject: [PATCH 27/35] Implement

---
 tests/test_collection_cached.py | 34 ++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/tests/test_collection_cached.py b/tests/test_collection_cached.py
index 82b169d..30c481b 100644
--- a/tests/test_collection_cached.py
+++ b/tests/test_collection_cached.py
@@ -110,7 +110,7 @@
 """.strip()
 
 
-@scenario("features/cached.feature", "list coauthors")
+@scenario("features/cached.feature", "duplicated articles are removed")
 def test_preheat_cache():
     pass
 
@@ -120,6 +120,11 @@ def collection_context() -> Context[CachedCollection]:
     return Context()
 
 
+@fixture
+def two_collections_context() -> Tuple[Context, Context]:
+    return Context(), Context()
+
+
 @fixture
 def isi_text():
     return ISI_TEXT
@@ -206,3 +211,30 @@ def iterate_over_collection_coauthors(
             assert coauthor in ISI_TEXT
             assert count >= 1
 
+
+@when("I create a collection from that text")
+@when("I create a collection from twice that text")
+def create_two_collections(isi_text, two_collections_context):
+    first_context, second_context = two_collections_context
+    buffer = io.StringIO(isi_text)
+
+    with first_context.capture():
+        first_collection = CachedCollection(buffer)
+        first_context.push(first_collection)
+
+    with second_context.capture():
+        second_collection = CachedCollection(buffer, buffer)
+        second_context.push(second_collection)
+
+
+@then("both collections have the same number of articles")
+def same_number_of_articles(two_collections_context):
+    first_context, second_context = two_collections_context
+
+    with first_context.assert_data() as first_collection:
+        with second_context.assert_data() as second_collection:
+            assert len(first_collection) == len(second_collection)
+            assert sorted([art.label for art in first_collection]) == sorted(
+                [art.label for art in second_collection]
+            )
+

From d22453e1da530122ab7f150c006b1890eb15720d Mon Sep 17 00:00:00 2001
From: Juan David Alzate Cardona <jdalzatec@unal.edu.co>
Date: Fri, 7 Aug 2020 21:51:59 -0500
Subject: [PATCH 28/35] Implement test and fix a small issue when
 merging articles

---
 tests/test_collection_cached.py | 13 +++++++++++--
 wostools/article.py             |  2 ++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/tests/test_collection_cached.py b/tests/test_collection_cached.py
index 30c481b..bb24189 100644
--- a/tests/test_collection_cached.py
+++ b/tests/test_collection_cached.py
@@ -4,7 +4,7 @@
 from pytest import fixture
 from pytest_bdd import scenario, given, when, then
 
-from wostools import CachedCollection
+from wostools import CachedCollection, Article
 from wostools._testutils import Context
 
 ISI_TEXT = """
@@ -110,7 +110,7 @@
 """.strip()
 
 
-@scenario("features/cached.feature", "duplicated articles are removed")
+@scenario("features/cached.feature", "citation pairs")
 def test_preheat_cache():
     pass
 
@@ -238,3 +238,12 @@ def same_number_of_articles(two_collections_context):
                 [art.label for art in second_collection]
             )
 
+
+@when("I list the collection's citation pairs")
+@then("all citation pairs are included")
+def list_collection_citation_pairs(context_valid_collection: Context[CachedCollection]):
+    with context_valid_collection.assert_data() as collection:
+        assert len(list(collection.citation_pairs())) == 37
+        for article, reference in collection.citation_pairs():
+            assert isinstance(article, Article)
+            assert isinstance(reference, Article)
diff --git a/wostools/article.py b/wostools/article.py
index 98da0aa..e66c11f 100644
--- a/wostools/article.py
+++ b/wostools/article.py
@@ -104,6 +104,8 @@ def merge(self, other: "Article") -> "Article":
             doi=self.doi or other.doi,
             sources={*self.sources, *other.sources},
             extra={**self.extra, **other.extra},
+            references=list({*self.references, *other.references}),
+            keywords=list({*self.keywords, *other.keywords}),
         )
 
     @classmethod

From 6056ac53a6d4413a3f122201674bae0901c55760 Mon Sep 17 00:00:00 2001
From: Juan David Alzate Cardona <jdalzatec@unal.edu.co>
Date: Fri, 7 Aug 2020 23:05:54 -0500
Subject: [PATCH 29/35] Implement tests; all the tests are running
 correctly

---
 tests/features/cached.feature   |   4 +-
 tests/test_collection_cached.py | 148 +++++++++++++++++++++++++++++---
 2 files changed, 136 insertions(+), 16 deletions(-)

diff --git a/tests/features/cached.feature b/tests/features/cached.feature
index 354996f..3512be9 100644
--- a/tests/features/cached.feature
+++ b/tests/features/cached.feature
@@ -43,8 +43,8 @@ Feature: cached collection
 
    Scenario: citation pairs include complete info from references
 
-      Given some valid isi record
+      Given some valid isi text
       And a diferent isi record that references the former
       When I create a collection from that text
-      And I list the collection's citation pairs
+      And I list the collection's citation pairs [2]
       Then the citation always include all the available data
\ No newline at end of file
diff --git a/tests/test_collection_cached.py b/tests/test_collection_cached.py
index bb24189..46118a7 100644
--- a/tests/test_collection_cached.py
+++ b/tests/test_collection_cached.py
@@ -2,7 +2,7 @@
 from typing import Collection, Dict, Tuple
 
 from pytest import fixture
-from pytest_bdd import scenario, given, when, then
+from pytest_bdd import scenarios, given, when, then
 
 from wostools import CachedCollection, Article
 from wostools._testutils import Context
@@ -107,12 +107,95 @@
 GA EU7BQ
 UT WOS:000401190100002
 ER
+
+EF
 """.strip()
 
+ISI_TEXT_DIFFERENT_RECORD = """
+FN Thomson Reuters Web of Science™
+VR 1.0
+PT J
+AU Bosworth, JK
+   Dobisz, EA
+   Hellwig, O
+   Ruiz, R
+AF Bosworth, Joan K.
+   Dobisz, Elizabeth A.
+   Hellwig, Olav
+   Ruiz, Ricardo
+TI Impact of Out-of-Plane Translational Order in Block Copolymer
+   Lithography
+SO MACROMOLECULES
+LA English
+DT Article
+ID BIT-PATTERNED MEDIA; DENSITY MULTIPLICATION; TERNARY BLENDS; THIN-FILMS;
+   DIMENSIONS; ROUGHNESS; DOMAINS; SHAPES
+AB In block copolymer lithography, subtle distortions in the self-assembled domains, such as tilting or bending, have a strong impact on the quality of the lithographic features upon pattern transfer. We compared the feature size distribution observed at the top-surface of block copolymer thin films with the size distribution that the self-assembled structures project at the substrate interface, i.e., the lithographic image. We performed the comparison for films of perpendicularly oriented cylindrical block copolymer domains with various degrees of lateral order. We found that the size distribution of the projected image does not mimic the well-known Gaussian distribution observed at the top surface. Instead, the lithographic features display a skewed distribution with a long tail toward smaller feature dimensions, a shift of the median and a reduced number of transferred features. The distortions are more pronounced for films with shorter correlation lengths. We propose a simplified model that explains the observed shifts in the size distribution of the projected image by considering the tilting that cylinders undergo in the vicinity of dislocations. The presence of defects disrupting the in-plane orientational order riot only impacts the size distribution of the self-assembled features, but also induces nearby cylinder tilting and some general loss of out-of-plane translational order which, upon pattern transfer, is responsible for the observed distortions on the feature size distribution,
+C1 [Bosworth, Joan K.; Dobisz, Elizabeth A.; Hellwig, Olav; Ruiz, Ricardo] Hitachi Global Storage Technol, San Jose Res Ctr, San Jose, CA 95135 USA.
+RP Ruiz, R (reprint author), Hitachi Global Storage Technol, San Jose Res Ctr, 3403 Yerba Buena Rd, San Jose, CA 95135 USA.
+EM ricardo.ruiz@hitachigst.com
+OI Ruiz, Ricardo/0000-0002-1698-4281
+CR ALBRECHT T, 2009, NANOSCALE MAGNETIC M
+   BATES FS, 1990, ANNU REV PHYS CHEM, V41, P525, DOI 10.1146/annurev.pc.41.100190.002521
+   Black CT, 2007, IBM J RES DEV, V51, P605
+   Cheng JY, 2008, ADV MATER, V20, P3155, DOI 10.1002/adma.200800826
+   Cheng JY, 2010, ACS NANO, V4, P4815, DOI 10.1021/nn100686v
+   Detcheverry FA, 2010, MACROMOLECULES, V43, P3446, DOI 10.1021/ma902332h
+   Edwards EW, 2007, MACROMOLECULES, V40, P90, DOI 10.1021/ma0607564
+   Guarini KW, 2002, ADV MATER, V14, P1290, DOI 10.1002/1521-4095(20020916)14:18<1290::AID-ADMA1290>3.0.CO;2-N
+   Hammond MR, 2003, MACROMOLECULES, V36, P8712, DOI 10.1021/ma026001o
+   Harrison C, 2004, EUROPHYS LETT, V67, P800, DOI 10.1209/epl/i2004-10126-5
+   Harrison C, 2002, PHYS REV E, V66, DOI 10.1103/PhysRevE.66.011706
+   Hellwig O, 2010, APPL PHYS LETT, V96, DOI 10.1063/1.3293301
+   HO CS, 1983, IEEE T PATTERN ANAL, V5, P593
+   *INTRS, LITH
+   Ji SX, 2011, MACROMOLECULES, V44, P4291, DOI 10.1021/ma2005734
+   Kleman M., 2003, SOFT MATTER PHYS INT
+   LIU CC, 2010, J VAC SCI TECHNOL B, V34
+   Liu G, 2010, J VAC SCI TECHNOL B, V28
+   Nagpal U, 2011, ACS NANO, V5, P5673, DOI 10.1021/nn201335v
+   Ruiz R, 2008, PHYS REV B, V77, DOI 10.1103/PhysRevB.77.054204
+   Ruiz R, 2008, SCIENCE, V321, P936, DOI 10.1126/science.1157626
+   Segalman RA, 2005, MAT SCI ENG R, V48, P191, DOI 10.1016/j.mser.2004.12.003
+   Segalman RA, 2003, PHYS REV LETT, V91, DOI 10.1103/PhysRevLett.91.196101
+   Segalman RA, 2003, MACROMOLECULES, V36, P3272, DOI 10.1021/ma021367m
+   Stipe BC, 2010, NAT PHOTONICS, V4, P484, DOI 10.1038/nphoton.2010.90
+   Stoykovich MP, 2010, MACROMOLECULES, V43, P2334, DOI 10.1021/ma902494v
+   Stuen KO, 2009, MACROMOLECULES, V42, P5139, DOI 10.1021/ma900520v
+   Tada Y, 2009, POLYMER, V50, P4250, DOI 10.1016/j.polymer.2009.06.039
+   Welander AM, 2008, MACROMOLECULES, V41, P2759, DOI 10.1021/ma800056s
+   Welander AM, 2008, J VAC SCI TECHNOL B, V26, P2484, DOI 10.1116/1.2987963
+   Xiao SG, 2007, J VAC SCI TECHNOL B, V25, P1953, DOI 10.1116/1.2801860
+   Yang XM, 2009, ACS NANO, V3, P1844, DOI 10.1021/nn900073r
+NR 32
+TC 11
+Z9 11
+U1 4
+U2 22
+PU AMER CHEMICAL SOC
+PI WASHINGTON
+PA 1155 16TH ST, NW, WASHINGTON, DC 20036 USA
+SN 0024-9297
+J9 MACROMOLECULES
+JI Macromolecules
+PD DEC 13
+PY 2011
+VL 44
+IS 23
+BP 9196
+EP 9204
+DI 10.1021/ma201967a
+PG 9
+WC Polymer Science
+SC Polymer Science
+GA 855ZG
+UT WOS:000297604200016
+ER
+
+EF
+""".strip()
 
-@scenario("features/cached.feature", "citation pairs")
-def test_preheat_cache():
-    pass
+scenarios("features/cached.feature")
 
 
 @fixture
@@ -135,6 +218,11 @@ def valid_isi_text(isi_text):
     return isi_text
 
 
+@given("a diferent isi record that references the former")
+def isi_text_different_record():
+    return ISI_TEXT_DIFFERENT_RECORD
+
+
 @fixture
 def create_valid_collection(isi_text, collection_context: Context[CachedCollection]):
     with collection_context.capture():
@@ -161,8 +249,8 @@ def the_collection_cache_is_preheated(collection_context: Context[CachedCollecti
 
 @when("I iterate over the collection")
 @then("all articles and references are present")
-def iterate_over_collection(context_valid_collection: Context[CachedCollection]):
-    with context_valid_collection.assert_data() as collection:
+def iterate_over_collection(collection_context: Context[CachedCollection]):
+    with collection_context.assert_data() as collection:
         assert len(collection) == 38
         for article in collection:
             assert article
@@ -172,10 +260,8 @@ def iterate_over_collection(context_valid_collection: Context[CachedCollection])
 @when("I iterate over the collection authors")
 @then("all authors are included")
 @then("the author list include duplicates")
-def iterate_over_collection_authors(
-    context_valid_collection: Context[CachedCollection],
-):
-    with context_valid_collection.assert_data() as collection:
+def iterate_over_collection_authors(collection_context: Context[CachedCollection]):
+    with collection_context.assert_data() as collection:
         assert collection.authors
 
         authors: Dict[str, int] = {}
@@ -191,10 +277,8 @@ def iterate_over_collection_authors(
 @when("I iterate over the collection coauthors")
 @then("all coauthor pairs are included")
 @then("the coauthor list include duplicates")
-def iterate_over_collection_coauthors(
-    context_valid_collection: Context[CachedCollection],
-):
-    with context_valid_collection.assert_data() as collection:
+def iterate_over_collection_coauthors(collection_context: Context[CachedCollection]):
+    with collection_context.assert_data() as collection:
         assert collection.coauthors
 
         coauthors: Dict[Tuple[str, str], int] = {}
@@ -247,3 +331,39 @@ def list_collection_citation_pairs(context_valid_collection: Context[CachedColle
         for article, reference in collection.citation_pairs():
             assert isinstance(article, Article)
             assert isinstance(reference, Article)
+
+
+@when("I create a collection from that text")
+def create_collection_two_isi_files(
+    isi_text: str,
+    isi_text_different_record: str,
+    collection_context: Context[CachedCollection],
+):
+    buffer_1 = io.StringIO(isi_text)
+    buffer_2 = io.StringIO(isi_text_different_record)
+
+    with collection_context.capture():
+        collection = CachedCollection(buffer_1, buffer_2)
+        collection_context.push(collection)
+
+
+@when("I list the collection's citation pairs [2]")
+@then("the citation always include all the available data")
+def iterate_over_citation_pairs_two_isi_files(
+    collection_context: Context[CachedCollection],
+):
+    with collection_context.assert_data() as collection:
+        assert len(list(collection.citation_pairs())) == 68
+
+        having_keywords = False
+        for article, reference in collection.citation_pairs():
+            assert isinstance(article, Article)
+            assert isinstance(reference, Article)
+
+            if (
+                article.to_dict()["doi"] == "10.1002/polb.24346"
+                and reference.to_dict()["doi"] == "10.1021/ma201967a"
+            ):
+                having_keywords = bool(article.keywords and reference.keywords)
+
+        assert having_keywords

From 42c6cb7258232f32d7307ad79a0606ce76d51883 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sat, 8 Aug 2020 12:07:58 -0500
Subject: [PATCH 30/35] Fix some of the tests

---
 tests/features/cached.feature   | 50 ++++++++++++++--------------
 tests/test_collection_cached.py | 58 +++++++++++++++++++++------------
 2 files changed, 63 insertions(+), 45 deletions(-)

diff --git a/tests/features/cached.feature b/tests/features/cached.feature
index 3512be9..1387b73 100644
--- a/tests/features/cached.feature
+++ b/tests/features/cached.feature
@@ -14,37 +14,37 @@ Feature: cached collection
       When I iterate over the collection
       Then all articles and references are present
 
-   Scenario: list authors
+# Scenario: list authors
 
-      Given a valid collection
-      When I iterate over the collection authors
-      Then all authors are included
-      And the author list include duplicates
+#    Given a valid collection
+#    When I iterate over the collection authors
+#    Then all authors are included
+#    And the author list include duplicates
 
-   Scenario: list coauthors
+# Scenario: list coauthors
 
-      Given a valid collection
-      When I iterate over the collection coauthors
-      Then all coauthor pairs are included
-      And the coauthor list include duplicates
+#    Given a valid collection
+#    When I iterate over the collection coauthors
+#    Then all coauthor pairs are included
+#    And the coauthor list include duplicates
 
-   Scenario: duplicated articles are removed
+# Scenario: duplicated articles are removed
 
-      Given some valid isi text
-      When I create a collection from that text
-      And I create a collection from twice that text
-      Then both collections have the same number of articles
+#    Given some valid isi text
+#    When I create a collection from that text
+#    And I create a collection from twice that text
+#    Then both collections have the same number of articles
 
-   Scenario: citation pairs
+# Scenario: citation pairs
 
-      Given a valid collection
-      When I list the collection's citation pairs
-      Then all citation pairs are included
+#    Given a valid collection
+#    When I list the collection's citation pairs
+#    Then all citation pairs are included
 
-   Scenario: citation pairs include complete info from references
+# Scenario: citation pairs include complete info from references
 
-      Given some valid isi text
-      And a diferent isi record that references the former
-      When I create a collection from that text
-      And I list the collection's citation pairs [2]
-      Then the citation always include all the available data
\ No newline at end of file
+#    Given some valid isi text
+#    And a diferent isi record that references the former
+#    When I create a collection from that text
+#    And I list the collection's citation pairs
+#    Then the citation always include all the available data
\ No newline at end of file
diff --git a/tests/test_collection_cached.py b/tests/test_collection_cached.py
index 46118a7..7aa5c0e 100644
--- a/tests/test_collection_cached.py
+++ b/tests/test_collection_cached.py
@@ -1,5 +1,5 @@
 import io
-from typing import Collection, Dict, Tuple
+from typing import List, Dict, Tuple
 
 from pytest import fixture
 from pytest_bdd import scenarios, given, when, then
@@ -203,19 +203,24 @@ def collection_context() -> Context[CachedCollection]:
     return Context()
 
 
+@fixture
+def iterate_collection_context() -> Context[List[Article]]:
+    return Context()
+
+
 @fixture
 def two_collections_context() -> Tuple[Context, Context]:
     return Context(), Context()
 
 
-@fixture
-def isi_text():
+@given("some valid isi text", target_fixture="isi_text")
+def valid_isi_text():
     return ISI_TEXT
 
 
-@given("some valid isi text")
-def valid_isi_text(isi_text):
-    return isi_text
+@given("some invalid isi text", target_fixture="isi_text")
+def invalid_isi_text():
+    return "INVALID invalid"
 
 
 @given("a diferent isi record that references the former")
@@ -224,21 +229,23 @@ def isi_text_different_record():
 
 
 @fixture
-def create_valid_collection(isi_text, collection_context: Context[CachedCollection]):
+def create_valid_collection(collection_context: Context[CachedCollection]):
+    collection = CachedCollection(io.StringIO(ISI_TEXT))
+    collection_context.push(collection)
+
+
+@when("I create a collection from that text")
+def create_collection(isi_text):
     with collection_context.capture():
         collection = CachedCollection(io.StringIO(isi_text))
         collection_context.push(collection)
     return collection_context
 
 
-@when("I create a collection from that text")
-def create_collection(create_valid_collection):
-    pass
-
-
 @given("a valid collection")
-def context_valid_collection(create_valid_collection):
-    return create_valid_collection
+def context_valid_collection(collection_context):
+    collection = CachedCollection(io.StringIO(ISI_TEXT))
+    collection_context.push(collection)
 
 
 @then("the collection's cache is preheated")
@@ -248,11 +255,22 @@ def the_collection_cache_is_preheated(collection_context: Context[CachedCollecti
 
 
 @when("I iterate over the collection")
-@then("all articles and references are present")
-def iterate_over_collection(collection_context: Context[CachedCollection]):
+def iterate_over_collection(
+    collection_context: Context[CachedCollection],
+    iterate_collection_context: Context[List[Article]],
+):
     with collection_context.assert_data() as collection:
-        assert len(collection) == 38
-        for article in collection:
+        with iterate_collection_context.capture():
+            iterate_collection_context.push(list(collection))
+
+
+@then("all articles and references are present")
+def all_articles_and_references_are_present(
+    iterate_collection_context: Context[List[Article]],
+):
+    with iterate_collection_context.assert_data() as articles:
+        assert len(articles) == 38
+        for article in articles:
             assert article
             assert article.label
 
@@ -325,8 +343,8 @@ def same_number_of_articles(two_collections_context):
 
 @when("I list the collection's citation pairs")
 @then("all citation pairs are included")
-def list_collection_citation_pairs(context_valid_collection: Context[CachedCollection]):
-    with context_valid_collection.assert_data() as collection:
+def list_collection_citation_pairs(collection_context: Context[CachedCollection]):
+    with collection_context.assert_data() as collection:
         assert len(list(collection.citation_pairs())) == 37
         for article, reference in collection.citation_pairs():
             assert isinstance(article, Article)

From c0058ca79bd82b942e32d89eeb9cf6a127c000e5 Mon Sep 17 00:00:00 2001
From: Juan David Alzate Cardona <jdalzatec@unal.edu.co>
Date: Sat, 8 Aug 2020 12:31:30 -0500
Subject: [PATCH 31/35] Refactor the tests based on @odarbelaeze's
 suggestions and corrections

---
 tests/features/cached.feature   | 50 ++++++++---------
 tests/test_collection_cached.py | 97 +++++++++++++++++++++++++--------
 2 files changed, 99 insertions(+), 48 deletions(-)

diff --git a/tests/features/cached.feature b/tests/features/cached.feature
index 1387b73..83af654 100644
--- a/tests/features/cached.feature
+++ b/tests/features/cached.feature
@@ -14,37 +14,37 @@ Feature: cached collection
       When I iterate over the collection
       Then all articles and references are present
 
-# Scenario: list authors
+Scenario: list authors
 
-#    Given a valid collection
-#    When I iterate over the collection authors
-#    Then all authors are included
-#    And the author list include duplicates
+   Given a valid collection
+   When I iterate over the collection authors
+   Then all authors are included
+   And the author list include duplicates
 
-# Scenario: list coauthors
+Scenario: list coauthors
 
-#    Given a valid collection
-#    When I iterate over the collection coauthors
-#    Then all coauthor pairs are included
-#    And the coauthor list include duplicates
+   Given a valid collection
+   When I iterate over the collection coauthors
+   Then all coauthor pairs are included
+   And the coauthor list include duplicates
 
-# Scenario: duplicated articles are removed
+Scenario: duplicated articles are removed
 
-#    Given some valid isi text
-#    When I create a collection from that text
-#    And I create a collection from twice that text
-#    Then both collections have the same number of articles
+   Given some valid isi text
+   When I create a collection from that text
+   And I create a collection from twice that text
+   Then both collections have the same number of articles
 
-# Scenario: citation pairs
+Scenario: citation pairs
 
-#    Given a valid collection
-#    When I list the collection's citation pairs
-#    Then all citation pairs are included
+   Given a valid collection
+   When I list the collection's citation pairs
+   Then all citation pairs are included
 
-# Scenario: citation pairs include complete info from references
+Scenario: citation pairs include complete info from references
 
-#    Given some valid isi text
-#    And a diferent isi record that references the former
-#    When I create a collection from that text
-#    And I list the collection's citation pairs
-#    Then the citation always include all the available data
\ No newline at end of file
+   Given some valid isi text
+   And a diferent isi record that references the former
+   When I create a collection from that text
+   And I list the collection's citation pairs
+   Then the citation always include all the available data
\ No newline at end of file
diff --git a/tests/test_collection_cached.py b/tests/test_collection_cached.py
index 7aa5c0e..71125db 100644
--- a/tests/test_collection_cached.py
+++ b/tests/test_collection_cached.py
@@ -208,6 +208,23 @@ def iterate_collection_context() -> Context[List[Article]]:
     return Context()
 
 
+@fixture
+def iterate_authors_collection_context() -> Context[List[str]]:
+    return Context()
+
+
+@fixture
+def iterate_coauthors_collection_context() -> Context[List[Tuple[str, str]]]:
+    return Context()
+
+
+@fixture
+def iterate_citation_pairs_collection_context() -> Context[
+    List[Tuple[Article, Article]]
+]:
+    return Context()
+
+
 @fixture
 def two_collections_context() -> Tuple[Context, Context]:
     return Context(), Context()
@@ -276,38 +293,60 @@ def all_articles_and_references_are_present(
 
 
 @when("I iterate over the collection authors")
+def iterate_over_collection_authors(
+    collection_context: Context[CachedCollection],
+    iterate_authors_collection_context: Context[List[str]],
+):
+    with collection_context.assert_data() as collection:
+        with iterate_authors_collection_context.capture():
+            iterate_authors_collection_context.push(list(collection.authors))
+
+
 @then("all authors are included")
 @then("the author list include duplicates")
-def iterate_over_collection_authors(collection_context: Context[CachedCollection]):
-    with collection_context.assert_data() as collection:
-        assert collection.authors
+def all_authors_included_even_duplicates(
+    iterate_authors_collection_context: Context[List[str]],
+):
+    with iterate_authors_collection_context.assert_data() as authors:
+        assert authors
 
-        authors: Dict[str, int] = {}
-        for author in collection.authors:
-            authors[author] = authors.get(author, 0) + 1
+        authors_count: Dict[str, int] = {}
+        for author in authors:
+            authors_count[author] = authors_count.get(author, 0) + 1
             assert author
 
-        for author, count in authors.items():
+        for author, count in authors_count.items():
             assert author in ISI_TEXT
             assert count >= 1
 
 
 @when("I iterate over the collection coauthors")
+def iterate_over_collection_coauthors(
+    collection_context: Context[CachedCollection],
+    iterate_coauthors_collection_context: Context[List[Tuple[str, str]]],
+):
+    with collection_context.assert_data() as collection:
+        with iterate_coauthors_collection_context.capture():
+            iterate_coauthors_collection_context.push(list(collection.coauthors))
+
+
 @then("all coauthor pairs are included")
 @then("the coauthor list include duplicates")
-def iterate_over_collection_coauthors(collection_context: Context[CachedCollection]):
-    with collection_context.assert_data() as collection:
-        assert collection.coauthors
+def all_coauthors_pairs_included_even_duplicates(
+    iterate_coauthors_collection_context: Context[List[Tuple[str, str]]],
+):
+    with iterate_coauthors_collection_context.assert_data() as coauthors:
+        assert coauthors
 
-        coauthors: Dict[Tuple[str, str], int] = {}
-        for pair in collection.coauthors:
-            coauthors[pair] = coauthors.get(pair, 0) + 1
+        coauthors_count: Dict[Tuple[str, str], int] = {}
+        for pair in coauthors:
+            coauthors_count[pair] = coauthors_count.get(pair, 0) + 1
 
             author, coauthor = pair
             assert author
             assert coauthor
 
-        for pair, count in coauthors.items():
+        for pair, count in coauthors_count.items():
             author, coauthor = pair
             assert author in ISI_TEXT
             assert coauthor in ISI_TEXT
@@ -342,11 +381,24 @@ def same_number_of_articles(two_collections_context):
 
 
 @when("I list the collection's citation pairs")
-@then("all citation pairs are included")
-def list_collection_citation_pairs(collection_context: Context[CachedCollection]):
+def list_collection_citation_pairs(
+    collection_context: Context[CachedCollection],
+    iterate_citation_pairs_collection_context: Context[List[Tuple[Article, Article]]],
+):
     with collection_context.assert_data() as collection:
-        assert len(list(collection.citation_pairs())) == 37
-        for article, reference in collection.citation_pairs():
+        with iterate_citation_pairs_collection_context.capture():
+            iterate_citation_pairs_collection_context.push(
+                list(collection.citation_pairs())
+            )
+
+
+@then("all citation pairs are included")
+def all_citation_pairs_are_included(
+    iterate_citation_pairs_collection_context: Context[List[Tuple[Article, Article]]]
+):
+    with iterate_citation_pairs_collection_context.assert_data() as citation_pairs:
+        assert len(citation_pairs) == 37
+        for article, reference in citation_pairs:
             assert isinstance(article, Article)
             assert isinstance(reference, Article)
 
@@ -365,16 +417,15 @@ def create_collection_two_isi_files(
         collection_context.push(collection)
 
 
-@when("I list the collection's citation pairs [2]")
 @then("the citation always include all the available data")
 def iterate_over_citation_pairs_two_isi_files(
-    collection_context: Context[CachedCollection],
+    iterate_citation_pairs_collection_context: Context[List[Tuple[Article, Article]]]
 ):
-    with collection_context.assert_data() as collection:
-        assert len(list(collection.citation_pairs())) == 68
+    with iterate_citation_pairs_collection_context.assert_data() as citation_pairs:
+        assert len(citation_pairs) == 68
 
         having_keywords = False
-        for article, reference in collection.citation_pairs():
+        for article, reference in citation_pairs:
             assert isinstance(article, Article)
             assert isinstance(reference, Article)
 

From 2599f1a05fd11ec8de09900f7f3fe51511bbd1ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sat, 8 Aug 2020 13:03:55 -0500
Subject: [PATCH 32/35] Clean up some extra test things

---
 tests/features/cached.feature   | 50 +++++++++++-----------
 tests/test_collection_cached.py | 74 ++++++---------------------------
 wostools/_testutils.py          |  5 +++
 3 files changed, 43 insertions(+), 86 deletions(-)

diff --git a/tests/features/cached.feature b/tests/features/cached.feature
index 83af654..7aebb9a 100644
--- a/tests/features/cached.feature
+++ b/tests/features/cached.feature
@@ -14,37 +14,37 @@ Feature: cached collection
       When I iterate over the collection
       Then all articles and references are present
 
-Scenario: list authors
+   Scenario: list authors
 
-   Given a valid collection
-   When I iterate over the collection authors
-   Then all authors are included
-   And the author list include duplicates
+      Given a valid collection
+      When I iterate over the collection authors
+      Then all authors are included
+      And the author list include duplicates
 
-Scenario: list coauthors
+   Scenario: list coauthors
 
-   Given a valid collection
-   When I iterate over the collection coauthors
-   Then all coauthor pairs are included
-   And the coauthor list include duplicates
+      Given a valid collection
+      When I iterate over the collection coauthors
+      Then all coauthor pairs are included
+      And the coauthor list include duplicates
 
-Scenario: duplicated articles are removed
+   Scenario: duplicated articles are removed
 
-   Given some valid isi text
-   When I create a collection from that text
-   And I create a collection from twice that text
-   Then both collections have the same number of articles
+      Given some valid isi text
+      When I create a collection from that text
+      And I create a collection from that text
+      Then both collections have the same number of articles
 
-Scenario: citation pairs
+   Scenario: citation pairs
 
-   Given a valid collection
-   When I list the collection's citation pairs
-   Then all citation pairs are included
+      Given a valid collection
+      When I list the collection's citation pairs
+      Then all citation pairs are included
 
-Scenario: citation pairs include complete info from references
+   Scenario: citation pairs include complete info from references
 
-   Given some valid isi text
-   And a diferent isi record that references the former
-   When I create a collection from that text
-   And I list the collection's citation pairs
-   Then the citation always include all the available data
\ No newline at end of file
+      Given some valid isi text
+      And a diferent isi record that references the former
+      When I create a collection from that text
+      And I list the collection's citation pairs
+      Then the citation always include all the available data
\ No newline at end of file
diff --git a/tests/test_collection_cached.py b/tests/test_collection_cached.py
index 71125db..0ef0530 100644
--- a/tests/test_collection_cached.py
+++ b/tests/test_collection_cached.py
@@ -1,5 +1,5 @@
 import io
-from typing import List, Dict, Tuple
+from typing import Collection, List, Dict, Tuple
 
 from pytest import fixture
 from pytest_bdd import scenarios, given, when, then
@@ -225,36 +225,20 @@ def iterate_citation_pairs_collection_context() -> Context[
     return Context()
 
 
-@fixture
-def two_collections_context() -> Tuple[Context, Context]:
-    return Context(), Context()
-
-
 @given("some valid isi text", target_fixture="isi_text")
 def valid_isi_text():
-    return ISI_TEXT
-
+    return [ISI_TEXT]
 
-@given("some invalid isi text", target_fixture="isi_text")
-def invalid_isi_text():
-    return "INVALID invalid"
 
-
-@given("a diferent isi record that references the former")
-def isi_text_different_record():
-    return ISI_TEXT_DIFFERENT_RECORD
-
-
-@fixture
-def create_valid_collection(collection_context: Context[CachedCollection]):
-    collection = CachedCollection(io.StringIO(ISI_TEXT))
-    collection_context.push(collection)
+@given("a diferent isi record that references the former", target_fixture="isi_text")
+def isi_text_different_record(isi_text):
+    return [*isi_text, ISI_TEXT_DIFFERENT_RECORD]
 
 
 @when("I create a collection from that text")
-def create_collection(isi_text):
+def create_collection(isi_text, collection_context: Context[CachedCollection]):
     with collection_context.capture():
-        collection = CachedCollection(io.StringIO(isi_text))
+        collection = CachedCollection(*(io.StringIO(doc) for doc in isi_text))
         collection_context.push(collection)
     return collection_context
 
@@ -353,31 +337,13 @@ def all_coauthors_pairs_included_even_duplicates(
             assert count >= 1
 
 
-@when("I create a collection from that text")
-@when("I create a collection from twice that text")
-def create_two_collections(isi_text, two_collections_context):
-    first_context, second_context = two_collections_context
-    buffer = io.StringIO(isi_text)
-
-    with first_context.capture():
-        first_collection = CachedCollection(buffer)
-        first_context.push(first_collection)
-
-    with second_context.capture():
-        second_collection = CachedCollection(buffer, buffer)
-        second_context.push(second_collection)
-
-
 @then("both collections have the same number of articles")
-def same_number_of_articles(two_collections_context):
-    first_context, second_context = two_collections_context
-
-    with first_context.assert_data() as first_collection:
-        with second_context.assert_data() as second_collection:
-            assert len(first_collection) == len(second_collection)
-            assert sorted([art.label for art in first_collection]) == sorted(
-                [art.label for art in second_collection]
-            )
+def same_number_of_articles(collection_context: Context[CachedCollection]):
+
+    with collection_context.assert_data() as collection:
+        with collection_context.assert_history(1) as latest:
+            print(latest)
+            assert len(collection) == len(latest[0])
 
 
 @when("I list the collection's citation pairs")
@@ -403,20 +369,6 @@ def all_citation_pairs_are_included(
             assert isinstance(reference, Article)
 
 
-@when("I create a collection from that text")
-def create_collection_two_isi_files(
-    isi_text: str,
-    isi_text_different_record: str,
-    collection_context: Context[CachedCollection],
-):
-    buffer_1 = io.StringIO(isi_text)
-    buffer_2 = io.StringIO(isi_text_different_record)
-
-    with collection_context.capture():
-        collection = CachedCollection(buffer_1, buffer_2)
-        collection_context.push(collection)
-
-
 @then("the citation always include all the available data")
 def iterate_over_citation_pairs_two_isi_files(
     iterate_citation_pairs_collection_context: Context[List[Tuple[Article, Article]]]
diff --git a/wostools/_testutils.py b/wostools/_testutils.py
index 2c5b94f..0e4dc9b 100644
--- a/wostools/_testutils.py
+++ b/wostools/_testutils.py
@@ -37,3 +37,8 @@ def assert_data(self, name=None) -> Iterator[T]:
     def assert_error(self) -> Iterator[Exception]:
         assert self.error, f"Expected an error and found none"
         yield self.error
+
+    @contextmanager
+    def assert_history(self, count):
+        assert len(self.history) >= count
+        yield self.history[-count:]

From 78f567395a1ea90988e51b591433f0778ec48b86 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sat, 8 Aug 2020 13:09:52 -0500
Subject: [PATCH 33/35] Update the history

---
 AUTHORS.md | 4 +++-
 HISTORY.md | 6 ++++++
 LICENSE    | 2 +-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/AUTHORS.md b/AUTHORS.md
index ed8d9a6..2d1388d 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -6,4 +6,6 @@
 
 ## Contributors
 
-None yet. Why not be the first?
+-   Oscar Arbeláez \<<odarbelaeze@gmail.com>\>
+-   Juan David Alzate Cardona \<<jdalzatec@gmail.com>\>
+-   Daniel Valencia \<<dsvalenciah@unal.edu.co>\>
diff --git a/HISTORY.md b/HISTORY.md
index 0abe58c..c389fd6 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,11 @@
 # History
 
+## 2.0.0 (2020-08-09)
+
+-   Make the article class more concrete
+-   Make collections iterable
+-   Add cached and lazy collections for different use cases
+
 ## 0.2.0 (2018-08-12)
 
 -   Add support for all WOS fields.
diff --git a/LICENSE b/LICENSE
index 40ceae2..8f2c7f5 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2018, Core of Science
+Copyright (c) 2018-2020, Core of Science
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

From f91d0672829ced9c313fb36e78403ca75fa96c17 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sat, 8 Aug 2020 13:21:43 -0500
Subject: [PATCH 34/35] Clean up bumpversion

---
 .zenodo.json         |  9 +++++++--
 requirements_dev.txt |  1 -
 setup.cfg            | 17 -----------------
 3 files changed, 7 insertions(+), 20 deletions(-)

diff --git a/.zenodo.json b/.zenodo.json
index 937d91b..1843f61 100644
--- a/.zenodo.json
+++ b/.zenodo.json
@@ -1,8 +1,8 @@
 {
     "description": "Translates isi web of knowledge files into python objects.",
     "license": "MIT",
-    "title": "coreofscience/python-wostools: Add citation graph support",
-    "version": "v0.2.0",
+    "title": "coreofscience/python-wostools",
+    "version": "v1.1.0",
     "upload_type": "software",
     "publication_date": "2018-08-13",
     "creators": [
@@ -11,6 +11,11 @@
             "affiliation": "Core of science",
             "name": "Oscar David Arbeláe1ez E."
         },
+        {
+            "orcid": "0000-0002-1249-7128",
+            "affiliation": "Core of science",
+            "name": "Juan David Alzate Cardona"
+        },
         {
             "name": "Daniel Stiven Valencia Hernandez",
             "affiliation": "Core of science"
diff --git a/requirements_dev.txt b/requirements_dev.txt
index bf1eec4..a77f5e9 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -1,4 +1,3 @@
-bumpversion==0.6.0
 flake8==3.8.3
 coverage==5.2.1
 Sphinx==3.1.2
diff --git a/setup.cfg b/setup.cfg
index c5e381a..aefb114 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,20 +1,3 @@
-[bumpversion]
-current_version = 1.1.0
-commit = True
-tag = True
-
-[bumpversion:file:setup.py]
-search = version='{current_version}'
-replace = {new_version}
-
-[bumpversion:file:.zenodo.json]
-search = v{current_version}
-replace = v{new_version}
-
-[bumpversion:file:wostools/__init__.py]
-search = __version__ = '{current_version}'
-replace = {new_version}
-
 [bdist_wheel]
 universal = 1
 

From 5930ed1012088a879b2bf6f055eeb118cdefe7b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oscar=20Arbel=C3=A1ez?= <odarbelaeze@gmail.com>
Date: Sat, 8 Aug 2020 13:23:08 -0500
Subject: [PATCH 35/35] Bump the version

---
 .zenodo.json         | 4 ++--
 setup.py             | 2 +-
 wostools/__init__.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.zenodo.json b/.zenodo.json
index 1843f61..24fc819 100644
--- a/.zenodo.json
+++ b/.zenodo.json
@@ -2,7 +2,7 @@
     "description": "Translates isi web of knowledge files into python objects.",
     "license": "MIT",
     "title": "coreofscience/python-wostools",
-    "version": "v1.1.0",
+    "version": "v2.0.0",
     "upload_type": "software",
     "publication_date": "2018-08-13",
     "creators": [
@@ -25,7 +25,7 @@
     "related_identifiers": [
         {
             "scheme": "url",
-            "identifier": "https://github.com/coreofscience/python-wostools/tree/v1.1.0",
+            "identifier": "https://github.com/coreofscience/python-wostools/tree/v2.0.0",
             "relation": "isSupplementTo"
         },
         {
diff --git a/setup.py b/setup.py
index 4c3f5ae..9d07288 100644
--- a/setup.py
+++ b/setup.py
@@ -42,7 +42,7 @@
     test_suite="tests",
     tests_require=test_requirements,
     url="https://github.com/coreofscience/python-wostools",
-    version="1.1.0",
+    version="2.0.0",
     zip_safe=False,
     long_description_content_type="text/markdown",
 )
diff --git a/wostools/__init__.py b/wostools/__init__.py
index ca3af1c..5885702 100644
--- a/wostools/__init__.py
+++ b/wostools/__init__.py
@@ -2,7 +2,7 @@
 
 __author__ = """Core of Science"""
 __email__ = "dev@coreofscience.com"
-__version__ = "1.1.0"
+__version__ = "2.0.0"
 
 from wostools.article import Article
 from wostools.lazy import LazyCollection