Skip to content

Commit

Permalink
feat: activity datapoint vectorizer that makes use of field-specific …
Browse files Browse the repository at this point in the history
…DSPNs
  • Loading branch information
Francesco Stablum committed Nov 25, 2021
1 parent a825644 commit 2a6be55
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 14 deletions.
16 changes: 14 additions & 2 deletions common/relspecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@
)
)

from common import persistency
from common import utils
from common import persistency, utils, config
from models import text_model


Expand Down Expand Up @@ -87,6 +86,19 @@ def glue(self, tensor_list): # FIXME: maybe to some other module?
ret = tensor_list
return ret

def extract_from_activity_data(self, activity_data):
    """Collect this relation's fields from one raw activity record.

    Keys of ``activity_data`` are matched against the pattern
    ``<self.name>_<field>``.  Every key whose ``<field>`` part is listed
    in ``self.fields_names`` is kept, and its value is capped to the first
    ``config.download_max_set_size`` items.

    Args:
        activity_data: mapping from string keys to sliceable values
            (presumably lists taken from a downloaded document -
            confirm against the callers).

    Returns:
        dict mapping the bare field name (relation prefix removed) to the
        capped value. Empty when no key belongs to this relation.
    """
    # A literal prefix test replaces the former regular expression built by
    # interpolating self.name: regex metacharacters in a relation name can
    # no longer change what matches, and the `re` module is not needed.
    prefix = f'{self.name}_'
    ret = {}
    for k, v in activity_data.items():
        if not k.startswith(prefix):
            continue
        rel_field = k[len(prefix):]
        if rel_field in self.fields_names:
            # cap the amount of items to config.download_max_set_size
            ret[rel_field] = v[:config.download_max_set_size]
    return ret

@property
def scalers(self):
return [curr.scaler for curr in self.fields]
Expand Down
14 changes: 2 additions & 12 deletions preprocess/dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,18 +70,8 @@ def parse(page, ti):
for activity in data['response']['docs']:
activity_id = activity['iati_identifier']
# logging.info(f"processing activity {activity_id}")
for k, v in activity.items():
# logging.info(f"processing activity item {k}")
for rel in rels:
# logging.info(f"processing rel {rel.name}")
m = re.match(f'{rel.name}_(.*)', k)
if m is not None:
rel_field = m.group(1)
if rel_field in rel.fields_names:
# cap the amount of items to config.download_max_set_size
v = v[:config.download_max_set_size]
# logging.info(f"considering field {rel_field}")
rels_vals[rel.name][activity_id][rel_field] = v
for rel in rels:
rels_vals[rel.name][activity_id] = rel.extract_from_activity_data(activity)

for rel, sets in rels_vals.items():
remove = []
Expand Down
22 changes: 22 additions & 0 deletions preprocess/vectorize_activity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from common import relspecs, utils
from models import models_storage
import numpy as np

class Activity(utils.Collection):
    """Per-relation view of one raw activity record.

    For every relation spec, the dict returned by
    ``rel.extract_from_activity_data(activity_data)`` is stored under the
    key ``rel.name``.
    """

    def __init__(self, activity_data):
        """Populate the collection from ``activity_data``.

        Args:
            activity_data: raw activity mapping, passed unchanged to each
                relation's ``extract_from_activity_data``.
        """
        # `relspecs` is the imported module; a module cannot be iterated
        # (``for rel in relspecs`` raises TypeError), so loop over the
        # collection of relation specs it holds instead.
        # NOTE(review): assumes that collection is exposed as
        # `relspecs.rels` - confirm against common/relspecs.py.
        for rel in relspecs.rels:
            self[rel.name] = rel.extract_from_activity_data(activity_data)


class ActivityVectorizer(object):
    """Turn an activity into one flat vector using per-relation encoders."""

    def __init__(self):
        # Storage indexed by relation name; each entry exposes an `encoder`.
        self.model_storage = models_storage.DSPNAEModelsStorage()

    def vectorize_activity(self, activity):
        """Encode every relation of ``activity`` and stack the results.

        Args:
            activity: mapping indexed by relation name (e.g. an
                ``Activity``) holding the data of each relation.

        Returns:
            The result of ``np.hstack`` over the per-relation encoder
            outputs, in the iteration order of the relation specs.
        """
        vectorized_fields = []
        # `relspecs` is the imported module; a module cannot be iterated
        # (``for rel in relspecs`` raises TypeError), so loop over the
        # collection of relation specs it holds instead.
        # NOTE(review): assumes that collection is exposed as
        # `relspecs.rels` - confirm against common/relspecs.py.
        for rel in relspecs.rels:
            field_data = activity[rel.name]
            vectorized_field = self.model_storage[rel.name].encoder(field_data)
            vectorized_fields.append(vectorized_field)
        return np.hstack(vectorized_fields)

0 comments on commit 2a6be55

Please sign in to comment.