diff --git a/tools/update_airlines.py b/tools/update_airlines.py index 0af4a5fe..e58e4fae 100644 --- a/tools/update_airlines.py +++ b/tools/update_airlines.py @@ -142,12 +142,23 @@ def diff(self, of, wp): fields = {} for field in ['name', 'callsign', 'icao', 'iata', 'source', 'country', 'country_code', 'start_date', 'end_date', 'duplicate']: + # Do we have a new value? if field in wp and wp[field] and wp[field] != of[field]: + + # Is the change more than just case? if not of[field] or str(wp[field]).upper() != str(of[field]).upper(): + + # Only alter main name if it's not an abbreviation if field != 'name' or len(wp[field]) > 3: + + # Reliable sources can overwrite, unreliable ones can only append if reliable or not of[field]: fields[field] = wp[field] + # Special case: If it's a new name, retain old name as alias + if field == 'name': + fields['alias'] = of['name'] + # Special case: Only override activeness if new source says airline is defunct if of['active'] == 'Y' and wp['active'] == 'N': fields['active'] = 'N' @@ -266,15 +277,21 @@ class Wikidata(object): def load(self, filename): with open(filename, 'rb') as csvfile: reader = unicodecsv.DictReader(csvfile, delimiter=',') - # entity,airlineLabel,iata,icao,callsign,countryLabel,countryIso,startDate,endDate + # entity,airlineLabel,iata,icao,parentIata,parentIcao,callsign,countryLabel,countryIso,startDate,endDate self.parse(reader) def parse(self, rows): for airline in rows: # Ignore airlines where the name is just the Wikidata entity code if airline['entity'].rsplit('/', 1)[1] == airline['airlineLabel']: - print(". IGNORE[WIKIDATA] %s" % airline) + print(". FILTER[WIKIDATA:NONAME] %s" % airline) continue + + # Ignore subsidiaries where the IATA code is identical to parent (commuter brands, cargo operations etc) + if airline['iata'] == airline.get('parentIata'): + print(". FILTER[WIKIDATA:SUBSIDIARY] %s" % airline) + continue + start_date, end_date = airline['startDate'], airline['endDate'] if start_date and not end_date: active = 'Y' diff --git a/tools/update_airlines_test.py b/tools/update_airlines_test.py index c11396f8..3282877e 100644 --- a/tools/update_airlines_test.py +++ b/tools/update_airlines_test.py @@ -66,9 +66,9 @@ def testExactMatchLessReliableSourceUpdateNulls(self): wp = {'icao': 'ABC', 'iata': 'AB', 'name': 'Åland Lol Airlines', 'callsign': 'NEWXA', 'country': 'Åland', 'source': 'User'} self.assertOnlyChange(wp, diff={'callsign': 'NEWXA'}) - def testNameChange(self): + def testNameChangeKeepingOldNameAsAlias(self): wp = {'icao': 'ABC', 'iata': 'AB', 'name': 'Ahvenanmaa Airlines', 'callsign': 'ALAXA', 'country': 'Åland', 'source': 'Wikidata'} - self.assertOnlyChange(wp, diff={'name': 'Ahvenanmaa Airlines'}) + self.assertOnlyChange(wp, diff={'name': 'Ahvenanmaa Airlines', 'alias': 'Åland Airlines'}) def testCountryChange(self): wp = {'icao': 'ABC', 'iata': 'AB', 'name': 'Åland Airlines', 'callsign': 'ALAXA', 'country': 'Ahvenanmaa', 'source': 'Wikidata'} @@ -80,12 +80,12 @@ def testCountryCodeChange(self): def testActiveToInactiveChange(self): self.of['active'] = 'Y' - wp = {'icao': 'ABC', 'iata': 'AB', 'name': 'Ahvenanmaa Airlines', 'callsign': 'ALAXA', 'country': 'Åland', 'source': 'Wikidata', 'active': 'N', 'end_date': '2019-01-01'} - self.assertOnlyChange(wp, diff={'name': 'Ahvenanmaa Airlines', 'active': 'N', 'end_date': '2019-01-01'}) + wp = {'icao': 'ABC', 'iata': 'AB', 'name': 'Åland Airlines', 'callsign': 'ALAXA', 'country': 'Åland', 'source': 'Wikidata', 'active': 'N', 'end_date': '2019-01-01'} + self.assertOnlyChange(wp, diff={'active': 'N', 'end_date': '2019-01-01'}) def testIgnoreInactiveToActiveChange(self): - wp = {'icao': 'ABC', 'iata': 'AB', 'name': 'Ahvenanmaa Airlines', 'callsign': 'ALAXA', 'country': 'Åland', 'source': 'Wikidata', 'active': 'N', 'end_date': '2019-01-01'} - self.assertOnlyChange(wp, diff={'name': 'Ahvenanmaa Airlines', 'end_date': '2019-01-01'}) + wp = {'icao': 'ABC', 'iata': 'AB', 'name': 'Åland Airlines', 'callsign': 'ALAXA', 'country': 'Åland', 'source': 'Wikidata', 'active': 'N', 'end_date': '2019-01-01'} + self.assertOnlyChange(wp, diff={'end_date': '2019-01-01'}) def testIgnoreCaseChange(self): wp = {'icao': 'ABC', 'iata': 'AB', 'name': 'Åland AIRLINES', 'callsign': 'ALAXA', 'country': 'Åland', 'source': 'Wikidata'} @@ -220,6 +220,11 @@ def testWikidataIgnoreIfNameEqualsEntity(self): self.wd.parse([wiki]) self.assertEqual(self.wd.airlines, []) + def testWikidataIgnoreIfSameIATAAsParent(self): + wiki = {'entity': 'http://www.wikidata.org/entity/Q3487216', 'airlineLabel': 'Abba Airlines', 'iata': 'TF', 'icao': '', 'parentIata': 'TF', 'callsign': '', 'countryLabel': 'Sweden', 'countryIso': 'SE', 'startDate': '2023-01-01', 'endDate': ''} + self.wd.parse([wiki]) + self.assertEqual(self.wd.airlines, []) + # # Processing tests # diff --git a/tools/wikidata-airlines.sparql b/tools/wikidata-airlines.sparql index 8f2aefef..90750e2b 100644 --- a/tools/wikidata-airlines.sparql +++ b/tools/wikidata-airlines.sparql @@ -1,18 +1,20 @@ # SPARQL query for extracting airline data from Wikidata # Execute at https://query.wikidata.org/ and download results as CSV -SELECT (?airline as ?entity) ?airlineLabel ?iata ?icao ?callsign ?countryLabel ?countryIso +SELECT (?airline as ?entity) ?airlineLabel ?iata ?icao ?parentIata ?parentIcao ?callsign ?countryLabel ?countryIso (xsd:date(?start) AS ?startDate) (xsd:date(?end) AS ?endDate) # format as ISO dates WHERE { ?airline wdt:P31 wd:Q46970 . # instance of airline ?airline wdt:P229 ?iata . # IATA code not optional OPTIONAL{?airline wdt:P230 ?icao .} + OPTIONAL{?airline wdt:P749 ?parent . + ?parent wdt:P229 ?parentIata . + ?parent wdt:P230 ?parentIcao } OPTIONAL{?airline wdt:P571 ?start .} OPTIONAL{?airline wdt:P576 ?end .} OPTIONAL { ?airline wdt:P17 ?country . ?country wdt:P297 ?countryIso } OPTIONAL{?airline wdt:P432 ?callsign .} - SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } + SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } } -