Source code for sylloge.id_mapped

from dataclasses import dataclass
from typing import Dict, Iterable, Mapping, Optional, Sequence, Tuple

import numpy as np
import pandas as pd

from .utils import fix_dataclass_init_docs


@fix_dataclass_init_docs
@dataclass
class PandasTrainTestValSplit:
    """Train/test/validation split of the gold standard entity links as data frames."""

    #: entity links for training
    train: pd.DataFrame

    #: entity links for testing
    test: pd.DataFrame

    #: entity links for validation
    val: pd.DataFrame
def enhance_mapping(
    labels: Iterable, mapping: Optional[Mapping[str, int]] = None
) -> Dict[str, int]:
    """Map labels with given mapping and enhance mapping if unseen labels are encountered.

    Every label is converted with ``str`` before it is looked up. The given
    ``mapping`` is never modified; a new dictionary is returned. Unseen labels
    receive consecutive ids starting at ``len(mapping)``, in the order in which
    they are first encountered.

    :param labels: Labels to map
    :param mapping: Known mappings
    :return: Enhanced mapping
    """
    enhanced_mapping: Dict[str, int] = {} if mapping is None else dict(mapping)
    new_id = len(enhanced_mapping)
    for label in labels:
        label_str = str(label)
        # Look at the growing result, not only at the input mapping: otherwise
        # a label occurring twice would be re-assigned and an id would be skipped.
        if label_str not in enhanced_mapping:
            enhanced_mapping[label_str] = new_id
            new_id += 1
    return enhanced_mapping
def perform_map(
    triples: np.ndarray,
    head_map: Mapping[str, int],
    rel_map: Mapping[str, int],
    tail_map: Mapping[str, int],
) -> np.ndarray:
    """Map str triples to int ids via dictionaries.

    Each of the first three columns is translated with the ``get`` method of
    its mapping, so a label missing from its mapping does not raise
    ``KeyError`` here. An empty input yields an empty ``(0, 3)`` integer array.

    :param triples: string triples
    :param head_map: mapping for head column
    :param rel_map: mapping for rel column
    :param tail_map: mapping for tail column
    :return: integer id mapped triples
    """
    if triples.size == 0:
        # np.vectorize cannot infer an output dtype from zero elements and
        # raises ValueError, so the empty result is built directly.
        return np.empty((0, 3), dtype=int)
    # Slice with start:end instead of a plain column index: this keeps every
    # column at shape (n, 1) rather than (n,), ready to concatenate on axis 1.
    mapped_columns = [
        np.vectorize(column_map.get)(triples[:, position : position + 1])
        for position, column_map in enumerate((head_map, rel_map, tail_map))
    ]
    return np.concatenate(mapped_columns, axis=1)
def id_map_rel_triples(
    df: pd.DataFrame,
    entity_mapping: Optional[Dict[str, int]] = None,
    rel_mapping: Optional[Dict[str, int]] = None,
) -> Tuple[np.ndarray, Dict[str, int], Dict[str, int]]:
    """Translate labeled relation triples into an id-based numpy array.

    Heads and tails share one entity mapping; the middle column uses the
    relation mapping. Both mappings are extended with labels not seen before.

    :param df: labeled triples
    :param entity_mapping: already mapped entities
    :param rel_mapping: already mapped relations
    :return: id-based numpy array triples, (updated) entity label to id mapping,
        (updated) relation label to id mapping
    """
    str_triples = df.astype(str).to_numpy()
    head_col = str_triples[:, 0]
    rel_col = str_triples[:, 1]
    tail_col = str_triples[:, 2]

    # Sort the distinct labels so that id assignment is reproducible.
    distinct_entities = sorted(set(head_col) | set(tail_col))
    distinct_relations = sorted(set(rel_col))

    entity_mapping = enhance_mapping(distinct_entities, entity_mapping)
    rel_mapping = enhance_mapping(distinct_relations, rel_mapping)

    id_triples = perform_map(str_triples, entity_mapping, rel_mapping, entity_mapping)
    return id_triples, entity_mapping, rel_mapping
def _id_map_attr_triples( df: pd.DataFrame, entity_mapping: Dict[str, int], attr_rel_mapping: Optional[Dict[str, int]] = None, attr_mapping: Optional[Dict[str, int]] = None, ) -> Tuple[np.ndarray, Dict[str, int], Dict[str, int], Dict[str, int]]: """Map entity, relation labels and attributes to ids and create numpy array. :param df: labeled triples :param entity_mapping: already mapped entities :param attr_rel_mapping: already mapped attribute relations :param attr_mapping: already mapped attributes :return: id-based numpy array triples, (updated) entity label to id mapping, (updated) relation label to id mapping, (updated) attribute to id mapping """ triples = df.astype(str).to_numpy() heads, rels, tails = triples[:, 0], triples[:, 1], triples[:, 2] # sorting ensures consistent results entity_labels = sorted(set(heads)) relation_labels = sorted(set(rels)) attributes = sorted(set(tails)) entity_mapping = enhance_mapping(entity_labels, entity_mapping) rel_mapping = enhance_mapping(relation_labels, attr_rel_mapping) attr_mapping = enhance_mapping(attributes, attr_mapping) return ( perform_map(triples, entity_mapping, rel_mapping, attr_mapping), entity_mapping, rel_mapping, attr_mapping, ) def _map_links(links: pd.DataFrame, entity_mapping: Dict[str, int]) -> np.ndarray: """Map links via given mapping. :param links: entity links :param entity_mapping: label to id mapping :return: numpy array with ids """ tuples = links.to_numpy() entity_getter = np.vectorize(entity_mapping.get) return np.concatenate( [entity_getter(tuples[:, 0:1]), entity_getter(tuples[:, 1:2])], axis=1 )
@fix_dataclass_init_docs
@dataclass
class IdMappedTrainTestValSplit:
    """Train/test/validation split of the gold standard entity links as numpy arrays."""

    #: entity links for training
    train: np.ndarray

    #: entity links for testing
    test: np.ndarray

    #: entity links for validation
    val: np.ndarray
@fix_dataclass_init_docs
@dataclass
class IdMappedEADataset:
    """Dataclass holding information of the alignment class with mapping of string to numerical id."""

    #: relation triples of left knowledge graph
    rel_triples_left: np.ndarray
    #: relation triples of right knowledge graph
    rel_triples_right: np.ndarray
    #: attribute triples of left knowledge graph
    attr_triples_left: np.ndarray
    #: attribute triples of right knowledge graph
    attr_triples_right: np.ndarray
    #: gold standard entity links of alignment
    ent_links: np.ndarray
    #: label to id mapping for all entities
    entity_mapping: Dict[str, int]
    #: label to id mapping for all relations
    rel_mapping: Dict[str, int]
    #: label to id mapping for all attribute relations
    attr_rel_mapping: Dict[str, int]
    #: attribute to id mapping for all attributes
    attr_mapping: Dict[str, int]
    #: optional pre-split folds of the gold standard
    folds: Optional[Sequence[IdMappedTrainTestValSplit]] = None

    def __repr__(self) -> str:
        """Return a summary showing the length of every field instead of its content."""
        return (
            f"{self.__class__.__name__}("
            f"rel_triples_left={len(self.rel_triples_left)}, "
            f"rel_triples_right={len(self.rel_triples_right)}, "
            f"attr_triples_left={len(self.attr_triples_left)}, "
            f"attr_triples_right={len(self.attr_triples_right)}, "
            f"ent_links={len(self.ent_links)}, "
            f"entity_mapping={len(self.entity_mapping)}, "
            f"rel_mapping={len(self.rel_mapping)}, "
            f"attr_rel_mapping={len(self.attr_rel_mapping)}, "
            f"attr_mapping={len(self.attr_mapping)}, "
            f"folds={len(self.folds) if self.folds else None})"
        )

    @classmethod
    def from_frames(
        cls,
        rel_triples_left: pd.DataFrame,
        rel_triples_right: pd.DataFrame,
        attr_triples_left: pd.DataFrame,
        attr_triples_right: pd.DataFrame,
        ent_links: pd.DataFrame,
        folds: Optional[Sequence[PandasTrainTestValSplit]] = None,
    ) -> "IdMappedEADataset":
        """Create an id-mapped dataset from labeled data frames.

        The mappings are built incrementally: the left relation triples come
        first, then the right relation triples, then the left and right
        attribute triples. Links and folds are translated afterwards with the
        resulting entity mapping.

        :param rel_triples_left: relation triples of left knowledge graph
        :param rel_triples_right: relation triples of right knowledge graph
        :param attr_triples_left: attribute triples of left knowledge graph
        :param attr_triples_right: attribute triples of right knowledge graph
        :param ent_links: gold standard entity links of alignment
        :param folds: optional pre-split folds of the gold standard
        :return: dataset holding id-based arrays and the label to id mappings
        """
        left_rel_ids, ent_map, rel_map = id_map_rel_triples(rel_triples_left)
        right_rel_ids, ent_map, rel_map = id_map_rel_triples(
            rel_triples_right, entity_mapping=ent_map, rel_mapping=rel_map
        )

        left_attr_ids, ent_map, attr_rel_map, attr_map = _id_map_attr_triples(
            attr_triples_left, entity_mapping=ent_map
        )
        right_attr_ids, ent_map, attr_rel_map, attr_map = _id_map_attr_triples(
            attr_triples_right,
            entity_mapping=ent_map,
            attr_rel_mapping=attr_rel_map,
            attr_mapping=attr_map,
        )

        link_ids = _map_links(ent_links, ent_map)

        # A missing or empty fold sequence is stored as None.
        mapped_folds = None
        if folds:
            mapped_folds = [
                IdMappedTrainTestValSplit(
                    train=_map_links(fold.train, ent_map),
                    test=_map_links(fold.test, ent_map),
                    val=_map_links(fold.val, ent_map),
                )
                for fold in folds
            ]

        return cls(
            rel_triples_left=left_rel_ids,
            rel_triples_right=right_rel_ids,
            attr_triples_left=left_attr_ids,
            attr_triples_right=right_attr_ids,
            ent_links=link_ids,
            entity_mapping=ent_map,
            rel_mapping=rel_map,
            attr_rel_mapping=attr_rel_map,
            attr_mapping=attr_map,
            folds=mapped_folds,
        )