crawl

#!/usr/bin/env python

# Copyright (C) 2016 Dan Scott <dscott@laurentian.ca>

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
"""

import argparse
import datetime
import html5lib
import json
import logging
import os
import rdflib
import requests
import socket
import sys
import traceback
import urllib
import urllib.parse
from rdflib.namespace import DC, DCTERMS, FOAF, OWL, RDF, XMLNS

# Workaround bad SSL implementations per http://askubuntu.com/a/488277
import ssl
ssl.PROTOCOL_SSLv23 = ssl.PROTOCOL_TLSv1

class LibraryGraph:
    "A graph of all libraries of interest"

    def __init__(self, log_file):
        self.graph = rdflib.ConjunctiveGraph()
        self.log_file = log_file
        self.bind_namespaces(self.graph)

        logformat = '%(asctime)s %(levelname)s:%(message)s'

        if self.log_file:
            logging.basicConfig(format=logformat, filename=self.log_file, level=logging.INFO, datefmt='%Y:%m:%d %H:%M:%S')
        else:
            logging.basicConfig(format=logformat, stream=sys.stdout, level=logging.INFO, datefmt='%Y:%m:%d %H:%M:%S')

        # requests module is very chatty
        logging.getLogger("requests").setLevel(logging.WARNING)
        logging.getLogger("rdflib").setLevel(logging.WARNING)

    def load_json(self, json_file):
        try:
            jsonp = open(json_file, 'r')
        except FileNotFoundError:
            e = "JSON file %s not found: harvest the CWRC data first" % (json_file)
            logging.error(e)
            sys.exit(e)

        with jsonp:
            self.set_library_data(json.loads(jsonp.read()))

    def set_library_data(self, libraries):
        self.libraries = libraries

    def to_tsv(self, tsv_file):
        "Generate TSV representation of JSON"

        with open(tsv_file, 'w') as fp:
            for x in self.libraries:
                if x['url']:
                    fp.write("%s\t%s\t%s\t%s\t%s\n" % (x['url'], x['label'], x['community'], x['provCode'], x['subGroup']))

    def gather_library_html(self, maxlinks=None, overwrite=False, robots=False):
        """
        Downloads the HTML for each listed library

        This enables us to parse the HTML locally for RDFa, microdata, and
        JSON-LD without hammering the live site. We log redirects and
        errors encountered when accessing the URL so that we can investigate
        later.
        """

        tested = []
        headers = {'User-Agent': "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:43.0) Gecko/20100101 Firefox/43.0"}
        res = {"good": 1, "bad": 1}
        save_path = 'html'
        cnt = 0
        session = requests.Session()
        for lib in self.libraries:
            cnt += 1

            if maxlinks and cnt > maxlinks:
                continue

            if not lib['url']:
                url_problem("No URL", lib, res)
                continue

            u = None
            if robots:
                save_path = 'robots'
                parts = urllib.parse.urlparse(lib['url'])
                u = "%s://%s/robots.txt" % (parts.scheme, parts.netloc)
            else:
                u = clean_url(lib, cnt)

            if u in tested:
                logging.warning("%d\t%s\t%s\tSkipping duplicate" % (cnt, lib['label'], u))
                continue

            tested.append(u)

            if u[0:4] != 'http':
                url_problem("Not an HTTP URL", lib, res)
                continue

            logging.info("%d\t%s\t%s\tChecking..." % (cnt, lib['label'], u))
            try:
                if overwrite is False and os.stat(os.path.join(save_path, _url_to_filename(u))):
                    logging.debug("%d\t%s\t%s\tExists, skipping!" % (cnt, lib['label'], u))
                    res['good'] += 1
                    continue
            except:
                logging.info("%d\t%s\t%s\tGetting..." % (cnt, lib['label'], u))

            try:
                response = session.get(u, headers=headers, timeout=30)
                code = response.status_code
                if code >= 400:
                    # Log so we can investigate the problem
                    url_problem(code, lib, res)
                    return
                elif u != response.url and code == 301: # permanent
                    # Logged so we can add a urlPrevious entry to the JSON
                    logging.info("Redirect: %s\t%s" % (u, response.url))

                # write out the HTML
                logging.info("Writing HTML: %s" % (_url_to_filename(u)))
                self.write(os.path.join(save_path, _url_to_filename(u)), response.text)

                res['good'] += 1
            except socket.timeout:
                url_problem("timeout", lib, res)
            except (Exception) as error:
                url_problem(error.__class__.__name__, lib, res)

        return res

    def correct_base(self, graph, file_path, uri):
        """
        Swap the file system base for the URL base

        pyMicrodata appears to ignore the publicId parse() parameter and
        uses the filesystem source instead, which makes for pretty horrible
        graph identifiers.
        """

        for sub, pred, obj in graph:
            file_prefix = 'file://'
            if sub.startswith(file_prefix):
                uri_sub = rdflib.URIRef(urllib.parse.urljoin(uri, sub[len(os.path.join(file_prefix, file_path)):]))
                graph.remove((sub, pred, obj))
                graph.add((uri_sub, pred, obj))
            if obj.startswith(file_prefix):
                uri_obj = rdflib.URIRef(urllib.parse.urljoin(uri, urllib.parse.unquote(obj)[len(os.path.join(file_prefix, 'html')):]))
                graph.remove((sub, pred, obj))
                graph.add((sub, pred, uri_obj))

    def remove_xhtml_roles(self, graph):
        "XHTML roles clutter up the graph; remove them"

        xhtml = rdflib.Namespace('http://www.w3.org/1999/xhtml/vocab#')
        for sub, pred, obj in graph:
            if pred == xhtml.role:
                graph.remove((sub, pred, obj))

    def extract_lod_all(self, maxlinks=None):
        "Extract RDF from RDFa in returned HTML"

        cnt = 0
        for lib in self.libraries:
            cnt += 1
            if maxlinks and maxlinks > cnt:
                continue

            if not lib['url']:
                continue

            self.extract_lod(lib, cnt)

    def serialize(self, rdf_format='turtle'):
        "Return the graph in serialized form"

        return str(self.graph.serialize(format=rdf_format).decode('utf-8'))

    def bind_namespaces(self, graph):
        "Bind common namespaces"

        ogp = rdflib.Namespace('http://ogp.me/ns#')
        purlrss = rdflib.Namespace('http://purl.org/rss/1.0/modules/content/')
        rdfa = rdflib.Namespace('http://www.w3.org/ns/rdfa#')
        schema = rdflib.Namespace('http://schema.org/')
        sioc = rdflib.Namespace('http://rdfs.org/sioc/ns#')
        twitter = rdflib.Namespace('twitter:')
        xhtml = rdflib.Namespace('http://www.w3.org/1999/xhtml/vocab#')
        mdata = rdflib.Namespace('http://www.w3.org/ns/md#')

        graph.bind('dc', DC)
        graph.bind('dcterms', DCTERMS)
        graph.bind('md', mdata)
        graph.bind('ogp', ogp)
        graph.bind('rdfa', rdfa)
        graph.bind('schema', schema)
        graph.bind('sioc', sioc)
        graph.bind('xhtml', xhtml)
        graph.bind('foaf', FOAF)
        graph.bind('owl', OWL)
        graph.bind('rdf', RDF)
        graph.bind('twitter', twitter)
        graph.bind('xmlns', XMLNS)
        graph.bind('purlrss', purlrss)

    def extract_lod(self, lib, cnt):
        "Parse and write the graph for a single URL"

        # For html5lib / xml.etree XPathery
        ns = { 'xhtml': 'http://www.w3.org/1999/xhtml' }

        try:
            rawurl = clean_url(lib)
            g = rdflib.Graph(identifier=rdflib.URIRef(rawurl))
            fname = _url_to_filename(rawurl)
            fpath = os.path.join('html', fname)
            if not os.stat(fpath):
                return
            logging.info("%d\t%s\t%s\tParsing..." % (cnt, lib['label'], lib['url']))

            with open(fpath, 'r') as fp:
                # format='html' invokes both RDFa and Microdata parsers
                r = g.parse(file=fp, format='html', publicID=rdflib.URIRef(rawurl))
                self.bind_namespaces(g)
                self.correct_base(g, fpath, rdflib.URIRef(rawurl))

            with open(fpath, 'r') as fp:
                raw_data = fp.read()
                # parse JSON+LD islands
                tree = html5lib.parse(raw_data)
                for script in tree.findall('.//xhtml:script[@type="application/ld+json"]', ns):
                    g.parse(data=script.text, format='json-ld', publicID=rdflib.URIRef(rawurl))

            self.remove_xhtml_roles(g)
            g = self.stupid_hacks(g, rawurl)

            self.graph += g

            ttl = os.path.join('ttl', _url_to_filename(rawurl))
            try:
                os.stat(ttl)
            except:
                os.makedirs(ttl)

            with open(os.path.join(ttl, "triples.ttl"), "w") as out:
                out.write(str(g.serialize(format='turtle').decode('utf-8')))

        except (urllib.error.HTTPError) as inst:
            logging.error("%s %s %s" % (inst.code, inst.msg, inst.filename))
        except (Exception) as inst:
            logging.exception("Problem extracting data from URL %s" %(rawurl))

    def save_html(self, u, text, save_path):
        "Save the contents of the URL to disk"

    def stupid_hacks(self, graph, url):
        """
        Manifest the graph as turtle so we can use string-based hacks

        This is an awful way to fix these problems, but in some cases
        they have been introduced by the parsers (and outstanding bugs
        are waiting to be fixed), and in most cases I can't think of a
        clean way to manipulate the graph directly.
        """
        turtle = str(graph.serialize(format='turtle').decode('utf-8'))
        # Stupid hack to avoid blank nodes
        turtle = turtle.replace("\n\n[]", "\n\n<%s>" % (url))
        # Stupid hacks to avoid mdata empty nodes
        turtle = turtle.replace("\n\n<%s> md:item () ." % (url), "")
        turtle = turtle.replace("\n    md:item () ;", "\n")
        turtle = turtle.replace(";\n    md:item () .", ".")
        turtle = turtle.replace(" md:item () ;\n    ", " ")
        # Stupid hack to consolidate schema.org namespaces
        turtle = turtle.replace("www.schema.org", "schema.org")

        g = rdflib.Graph()
        g.parse(data=turtle, format='turtle', publicID=rdflib.URIRef(url))
        return g

    def write(self, filepath, content):
        "Write content to a file, creating the directory if necessary"
        try:
            os.mkdir(os.path.dirname(filepath))
        except:
            logging.info("Directory '%s' already exists" % (os.path.dirname(filepath)))

        try:
            with open(filepath, 'w') as f:
                logging.info("Writing to file '%s'" % (filepath))
                f.write(content)
        except (Exception) as error:
            logging.error("Could not open %s" % (filepath))

def clean_url(lib, cnt=0):
    "Remove spaces and unicode from the URL"

    u = lib['url'].strip().strip('\u200e').strip('\u200f')
    if u != lib['url']:
        logging.warning("%d\t%s\t[%s]\tURL has spaces or RTL/LTR Unicode" % (cnt, lib['label'], lib['url']))

    return u

def _url_to_filename(u):
    """
    Convert a URL to a writable filename

    Some libraries include Google Analytics GET params which generate URLs so
    long that they can't be used as filenames, so we strip out any "utm_"
    params.
    """

    parsed = urllib.parse.urlparse(u)

    removekeys = []
    qs = urllib.parse.parse_qs(parsed.query)

    # remove any GA keys
    for k in qs:
        if k.startswith('utm_'):
            removekeys.append(k)
    for k in removekeys:
        del qs[k]

    # swap in the cleaned params
    cleaned = parsed._replace(query=urllib.parse.urlencode(qs))

    return urllib.parse.quote(cleaned.geturl(), safe='')

def url_problem(problem, lib, r):
    "Abstract the many potential URL problems"
    logging.error("%s\t%s\t%s" % (lib['label'], lib['url'], problem))
    r['bad'] += 1

def main():
    "Main code"

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-l", "--libraries", help="JSON list of libraries to crawl", default="libraries_all.json")
    parser.add_argument("--ttl", help="File in which TTL should be saved", default="dialled.ttl")
    parser.add_argument("--log", help="Log file", default="crawl.log")
    parser.add_argument("--tsv-out", help="Convert JSON to TSV output", action="store_true")
    parser.add_argument("--tsv-file", help="File in which TSV should be saved", default="libraries_all.tsv")
    parser.add_argument("--html-refresh", help="Refresh HTML from each web site; otherwise previously stored HTML will be used", action="store_true")
    parser.add_argument("--robots", help="Gather robots.txt from each web site", action="store_true")
    args = parser.parse_args()

    libgraph = LibraryGraph(args.log)
    libgraph.load_json(args.libraries)
    logging.info('Starting')
    if args.tsv_out:
        libgraph.to_tsv(args.tsv_file)
    if args.html_refresh:
        libgraph.gather_library_html(overwrite=True)
    if args.robots:
        libgraph.gather_library_html(None, True, True)
    libgraph.extract_lod_all()
    today = datetime.date.today().isoformat()
    
    n3out = os.path.join('datadumps', "dialled_%s" % (today), args.ttl)
    libgraph.write(n3out, libgraph.serialize())

    logging.info('Finished')


if __name__ == '__main__':
    main()