-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add population for regions; closes ON-1510
Signed-off-by: Evan Prodromou <[email protected]>
- Loading branch information
Showing
8 changed files
with
41,213 additions
and
0 deletions.
There are no files selected for viewing
3,804 changes: 3,804 additions & 0 deletions
3,804
harmonize/data/processed/OEF:WD:subnational-population:20240322/ActorIdentifier.csv
Large diffs are not rendered by default.
Oops, something went wrong.
2 changes: 2 additions & 0 deletions
2
harmonize/data/processed/OEF:WD:subnational-population:20240322/DataSource.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
datasource_id,name,publisher,published,URL | ||
OEF:WD:subnational-population:20240322,Wikidata extract of subnational objects and their populations for years since 1990,OEF:WD,2024-03-22,https://github.com/Open-Earth-Foundation/OpenClimate/ |
9 changes: 9 additions & 0 deletions
9
harmonize/data/processed/OEF:WD:subnational-population:20240322/DataSourceTag.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
datasource_id,tag_id | ||
OEF:WD:subnational-population:20240322,geo | ||
OEF:WD:subnational-population:20240322,contextual | ||
OEF:WD:subnational-population:20240322,population | ||
OEF:WD:subnational-population:20240322,wikidata | ||
OEF:WD:subnational-population:20240322,cc0 | ||
OEF:WD:subnational-population:20240322,extract | ||
OEF:WD:subnational-population:20240322,oef | ||
OEF:WD:subnational-population:20240322,evanp |
16,462 changes: 16,462 additions & 0 deletions
16,462
harmonize/data/processed/OEF:WD:subnational-population:20240322/Population.csv
Large diffs are not rendered by default.
Oops, something went wrong.
2 changes: 2 additions & 0 deletions
2
harmonize/data/processed/OEF:WD:subnational-population:20240322/Publisher.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
id,name,URL | ||
OEF:WD,Open Earth Foundation extracts from Wikidata,https://github.com/Open-Earth-Foundation/OpenClimate-harmonize |
9 changes: 9 additions & 0 deletions
9
harmonize/data/processed/OEF:WD:subnational-population:20240322/Tag.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
tag_id,tag_name | ||
geo,geographical data | ||
contextual,contextual data | ||
population,population of an area | ||
wikidata,data extracted from Wikidata | ||
cc0,Creative Commons CC0 license | ||
extract,Dataset extracted from a larger database | ||
oef,Dataset from Open Earth Foundation | ||
evanp,Dataset by Evan Prodromou |
20,785 changes: 20,785 additions & 0 deletions
20,785
...onize/data/raw/wikidata-subnational-population-2024-3/wikidata_subnational_population.csv
Large diffs are not rendered by default.
Oops, something went wrong.
140 changes: 140 additions & 0 deletions
140
harmonize/scripts/wikidata_subnational_population_2024_3.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
import logging | ||
|
||
# Overwritten with the --api option when the file is run as a script;
# nothing else in this file reads it.
apihost = None

# Paths are relative to the directory the script is run from.
INPUT_FILE = "../data/raw/wikidata-subnational-population-2024-3/wikidata_subnational_population.csv"
OUTPUT_DIR = "../data/processed/OEF:WD:subnational-population:20240322"
ACTOR_FILE = "../data/processed/ISO-3166-2/Actor.csv"

# Single row written to Publisher.csv; key order is the CSV column order.
PUBLISHER = dict(
    id="OEF:WD",
    name="Open Earth Foundation extracts from Wikidata",
    URL="https://github.com/Open-Earth-Foundation/OpenClimate-harmonize",
)

# Single row written to DataSource.csv; key order is the CSV column order.
DATASOURCE = dict(
    datasource_id="OEF:WD:subnational-population:20240322",
    name="Wikidata extract of subnational objects and their populations for years since 1990",
    publisher=PUBLISHER["id"],
    published="2024-03-22",
    URL="https://github.com/Open-Earth-Foundation/OpenClimate/",
)

# Rows written to Tag.csv, and joined with the datasource id for
# DataSourceTag.csv.
TAGS = [
    {'tag_id': tag_id, 'tag_name': tag_name}
    for tag_id, tag_name in (
        ('geo', 'geographical data'),
        ('contextual', 'contextual data'),
        ('population', 'population of an area'),
        ('wikidata', 'data extracted from Wikidata'),
        ('cc0', 'Creative Commons CC0 license'),
        ('extract', 'Dataset extracted from a larger database'),
        ('oef', 'Dataset from Open Earth Foundation'),
        ('evanp', 'Dataset by Evan Prodromou'),
    )
]
|
||
import csv | ||
|
||
def slurp_file(name):
    """Read the CSV file *name* and return its rows as a list of dicts.

    The first line of the file is used as the header; each following record
    becomes one dict keyed by those column names.

    Args:
        name: Path of the CSV file to read.

    Returns:
        A list with one dict per data row, in file order.
    """
    # newline='' lets the csv module do its own line-ending handling, so
    # line breaks embedded in quoted fields are returned unchanged. An
    # explicit encoding keeps the result independent of the platform default.
    with open(name, newline='', encoding='utf-8') as csvfile:
        return list(csv.DictReader(csvfile))
|
||
def write_csv(name, rows, fieldnames=None):
    """Write *rows* to ``{OUTPUT_DIR}/{name}.csv`` with a header line.

    Args:
        name: Base name of the output file (no directory, no extension).
        rows: List of dicts, one per CSV record.
        fieldnames: Optional column names, in output order. When omitted,
            the keys of the first row are used.

    If *rows* is empty and no *fieldnames* are given there is nothing to
    derive a header from; an empty file is written and a warning is logged
    instead of failing with an IndexError.
    """
    if fieldnames is None:
        fieldnames = list(rows[0].keys()) if rows else []

    if not fieldnames:
        logging.warning('no rows and no fieldnames for %s; writing empty file', name)

    # newline='' stops the text layer from translating the writer's own
    # '\r\n' terminators (which would yield '\r\r\n' on Windows).
    with open(f'{OUTPUT_DIR}/{name}.csv', mode='w', newline='', encoding='utf-8') as csvfile:
        if fieldnames:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)
|
||
def read_actors(filename):
    """Load the actor CSV *filename* into a dict keyed by ``actor_id``.

    Args:
        filename: Path of a CSV file that has an ``actor_id`` column.

    Returns:
        A dict mapping each ``actor_id`` value to its full row (a dict of
        column name to string). If an id occurs more than once, the last
        row wins.

    Raises:
        KeyError: If a data row has no ``actor_id`` column.
    """
    # newline='' is what the csv module expects of files it reads; the
    # explicit encoding avoids depending on the platform default.
    with open(filename, 'r', newline='', encoding='utf-8') as csvfile:
        return {row['actor_id']: row for row in csv.DictReader(csvfile)}
|
||
def ensure_dir(dirname):
    """Create directory *dirname*, including missing parents, if needed.

    Calling this for a directory that already exists is a no-op.

    Raises:
        FileExistsError: If *dirname* exists but is not a directory.
    """
    import os
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first.
    os.makedirs(dirname, exist_ok=True)
|
||
def main():
    """Convert the raw Wikidata population extract into processed CSV files.

    Reads the known actors from ACTOR_FILE and the raw rows from INPUT_FILE,
    keeps only rows whose ``iso31662`` value is a known actor id, and writes
    Publisher, DataSource, Tag, DataSourceTag, ActorIdentifier and Population
    CSV files into OUTPUT_DIR.

    Raises:
        ValueError: If a row for a known actor has a ``population`` or
            ``populationYear`` value that cannot be parsed as a number.
        KeyError: If an input row lacks one of the expected columns.
    """
    actors = read_actors(ACTOR_FILE)
    rows = slurp_file(INPUT_FILE)

    datasource_id = DATASOURCE['datasource_id']

    actor_identifiers = {}
    # actor_id -> year -> every population value reported for that year
    values = {}

    for row in rows:
        actor_id = row['iso31662']

        if actor_id not in actors:
            logging.info(f'skipping {actor_id}')
            continue
        logging.info(f'processing {actor_id}')

        # Parse only after the actor filter so that a malformed row for an
        # actor we would skip anyway cannot abort the whole run.
        population = round(float(row['population']))
        year = int(row['populationYear'])

        if actor_id not in actor_identifiers:
            # The item column holds the full entity URL; keep only the Q-number.
            qno = row['item'].replace('http://www.wikidata.org/entity/', '')
            actor_identifiers[actor_id] = {
                'actor_id': actor_id,
                'identifier': qno,
                'namespace': 'Wikidata',
                'datasource_id': datasource_id,
            }

        # Note: we may have years with multiple records
        values.setdefault(actor_id, {}).setdefault(year, []).append(population)

    populations = [
        {
            'actor_id': actor_id,
            'population': max(pops),  # FIXME: better check!
            'year': year,
            'datasource_id': datasource_id,
        }
        for actor_id, years in values.items()
        for year, pops in years.items()
    ]

    ensure_dir(OUTPUT_DIR)

    write_csv('Publisher', [PUBLISHER])
    write_csv('DataSource', [DATASOURCE])
    write_csv('Tag', TAGS)
    write_csv(
        'DataSourceTag',
        [{'datasource_id': datasource_id, 'tag_id': tag['tag_id']} for tag in TAGS],
    )
    write_csv('ActorIdentifier', list(actor_identifiers.values()))
    write_csv('Population', populations)
|
||
if __name__ == "__main__":
    import argparse
    import os

    # Script entry point: parse the command line, configure logging, run main().
    # An unset or empty OPENCLIMATE_API falls back to the public host.
    default_api = os.environ.get('OPENCLIMATE_API') or 'https://openclimate.network'

    cli = argparse.ArgumentParser()
    cli.add_argument('-A', '--api', help='API host prefix', default=default_api)
    cli.add_argument('-d', '--debug', action='store_true', help='flag for running debug')
    options = cli.parse_args()

    apihost = options.api

    log_level = logging.DEBUG if options.debug else logging.INFO
    logging.basicConfig(level=log_level)

    main()