Source code for sylloge.id_mapped

from dataclasses import dataclass
from typing import Dict, Iterable, Mapping, Optional, Sequence, Tuple

import numpy as np
import pandas as pd

from .base import EADataset, TrainTestValSplit
from .utils import fix_dataclass_init_docs


[docs]def enhance_mapping(
    labels: Iterable, mapping: Mapping[str, int] = None
) -> Dict[str, int]:
    """Map labels with given mapping and enhance mapping if unseen labels are encountered.

    :param labels: Labels to map
    :param mapping: Know mappings
    :return: Enhanced mapping
    """
    mapping = {} if mapping is None else mapping
    new_id = len(mapping)
    enhanced_mapping = {**mapping}
    for label in labels:
        label = str(label)
        if label not in mapping:
            enhanced_mapping[label] = new_id
            new_id += 1
    return enhanced_mapping


[docs]def perform_map(
    triples: np.ndarray,
    head_map: Mapping[str, int],
    rel_map: Mapping[str, int],
    tail_map: Mapping[str, int],
) -> np.ndarray:
    """Map str triples to int ids via dictionaries.

    :param triples: string triples
    :param head_map: mapping for head column
    :param rel_map: mapping for rel column
    :param tail_map: mapping for tail column
    :return: integer id mapped triples
    """
    head_getter = np.vectorize(head_map.get)
    rel_getter = np.vectorize(rel_map.get)
    tail_getter = np.vectorize(tail_map.get)
    # have to use triples with start:end instead of heads/rels/tails variable
    # because this way we get array of shape (n,1) instead of (n,)
    head_column = head_getter(triples[:, 0:1])
    rel_column = rel_getter(triples[:, 1:2])
    tail_column = tail_getter(triples[:, 2:3])
    return np.concatenate([head_column, rel_column, tail_column], axis=1)


[docs]def id_map_rel_triples(
    df: pd.DataFrame,
    entity_mapping: Dict[str, int] = None,
    rel_mapping: Dict[str, int] = None,
) -> Tuple[np.ndarray, Dict[str, int], Dict[str, int]]:
    """Map entity and relation labels to ids and create numpy array.

    :param df: labeled triples
    :param entity_mapping: already mapped entities
    :param rel_mapping: already mapped relations
    :return: id-based numpy array triples, (updated) entity label to id mapping, (updated) relation label to id mapping
    """
    triples = df.astype(str).values
    heads, rels, tails = triples[:, 0], triples[:, 1], triples[:, 2]
    # sorting  ensures consistent results
    entity_labels = sorted(set(heads).union(tails))
    relation_labels = sorted(set(rels))
    entity_mapping = enhance_mapping(entity_labels, entity_mapping)
    rel_mapping = enhance_mapping(relation_labels, rel_mapping)
    return (
        perform_map(triples, entity_mapping, rel_mapping, entity_mapping),
        entity_mapping,
        rel_mapping,
    )


def _id_map_attr_triples(
    df: pd.DataFrame,
    entity_mapping: Dict[str, int],
    attr_rel_mapping: Dict[str, int] = None,
    attr_mapping: Dict[str, int] = None,
) -> Tuple[np.ndarray, Dict[str, int], Dict[str, int], Dict[str, int]]:
    """Map entity, relation labels and attributes to ids and create numpy array.

    :param df: labeled triples
    :param entity_mapping: already mapped entities
    :param attr_rel_mapping: already mapped attribute relations
    :param attr_mapping: already mapped attributes
    :return: id-based numpy array triples, (updated) entity label to id mapping, (updated) relation label to id mapping, (updated) attribute to id mapping
    """
    triples = df.astype(str).values
    heads, rels, tails = triples[:, 0], triples[:, 1], triples[:, 2]
    # sorting  ensures consistent results
    entity_labels = sorted(set(heads))
    relation_labels = sorted(set(rels))
    attributes = sorted(set(tails))
    entity_mapping = enhance_mapping(entity_labels, entity_mapping)
    rel_mapping = enhance_mapping(relation_labels, attr_rel_mapping)
    attr_mapping = enhance_mapping(attributes, attr_mapping)
    return (
        perform_map(triples, entity_mapping, rel_mapping, attr_mapping),
        entity_mapping,
        rel_mapping,
        attr_mapping,
    )


def _map_links(links: pd.DataFrame, entity_mapping: Dict[str, int]) -> np.ndarray:
    """Map links via given mapping.

    :param links: entity links
    :param entity_mapping: label to id mapping
    :return: numpy array with ids
    """
    tuples = links.values
    entity_getter = np.vectorize(entity_mapping.get)
    return np.concatenate(
        [entity_getter(tuples[:, 0:1]), entity_getter(tuples[:, 1:2])], axis=1
    )


[docs]@fix_dataclass_init_docs
@dataclass
class IdMappedTrainTestValSplit:
    """Dataclass holding split of gold standard entity links."""

    #: entity links for training
    train: np.ndarray
    #: entity links for testing
    test: np.ndarray
    #: entity links for validation
    val: np.ndarray


[docs]@fix_dataclass_init_docs
@dataclass
class IdMappedEADataset:
    """Dataclass holding information of the alignment class with mapping of string to numerical id."""

    #: relation triples of left knowledge graph
    rel_triples_left: np.ndarray
    #: relation triples of right knowledge graph
    rel_triples_right: np.ndarray
    #: attribute triples of left knowledge graph
    attr_triples_left: np.ndarray
    #: attribute triples of right knowledge graph
    attr_triples_right: np.ndarray
    #: gold standard entity links of alignment
    ent_links: np.ndarray
    #: label to id mapping for all entities
    entity_mapping: Dict[str, int]
    #: label to id mapping for all relations
    rel_mapping: Dict[str, int]
    #: label to id mapping for all attribute relations
    attr_rel_mapping: Dict[str, int]
    #: attribute to id mapping for all attributes
    attr_mapping: Dict[str, int]
    #: optional pre-split folds of the gold standard
    folds: Optional[Sequence[IdMappedTrainTestValSplit]] = None

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(rel_triples_left={len(self.rel_triples_left)}, rel_triples_right={len(self.rel_triples_right)}, attr_triples_left={len(self.attr_triples_left)}, attr_triples_right={len(self.attr_triples_right)}, ent_links={len(self.ent_links)}, entity_mapping={len(self.entity_mapping)}, rel_mapping={len(self.rel_mapping)}, attr_rel_mapping={len(self.attr_rel_mapping)}, attr_mapping={len(self.attr_mapping)}, folds={len(self.folds) if self.folds else None})"

    @classmethod
    def from_frames(
        cls,
        rel_triples_left: pd.DataFrame,
        rel_triples_right: pd.DataFrame,
        attr_triples_left: pd.DataFrame,
        attr_triples_right: pd.DataFrame,
        ent_links: pd.DataFrame,
        folds: Optional[Sequence[TrainTestValSplit]],
    ) -> "IdMappedEADataset":
        rel_triples_left, entity_mapping, rel_mapping = id_map_rel_triples(
            rel_triples_left
        )
        rel_triples_right, entity_mapping, rel_mapping = id_map_rel_triples(
            rel_triples_right,
            entity_mapping=entity_mapping,
            rel_mapping=rel_mapping,
        )
        (
            attr_triples_left,
            entity_mapping,
            attr_rel_mapping,
            attr_mapping,
        ) = _id_map_attr_triples(attr_triples_left, entity_mapping=entity_mapping)
        (
            attr_triples_right,
            entity_mapping,
            attr_rel_mapping,
            attr_mapping,
        ) = _id_map_attr_triples(
            attr_triples_right,
            entity_mapping=entity_mapping,
            attr_rel_mapping=attr_rel_mapping,
            attr_mapping=attr_mapping,
        )

        ent_links = _map_links(ent_links, entity_mapping)
        new_folds = None
        if folds:
            new_folds = []
            for fold in folds:
                train = _map_links(fold.train, entity_mapping)
                test = _map_links(fold.test, entity_mapping)
                val = _map_links(fold.val, entity_mapping)
                new_folds.append(
                    IdMappedTrainTestValSplit(train=train, test=test, val=val)
                )
        return cls(
            rel_triples_left=rel_triples_left,
            rel_triples_right=rel_triples_right,
            attr_triples_left=attr_triples_left,
            attr_triples_right=attr_triples_right,
            ent_links=ent_links,
            entity_mapping=entity_mapping,
            rel_mapping=rel_mapping,
            attr_rel_mapping=attr_rel_mapping,
            attr_mapping=attr_mapping,
            folds=new_folds,
        )

    @classmethod
    def from_ea_dataset(cls, dataset: EADataset) -> "IdMappedEADataset":
        return IdMappedEADataset.from_frames(
            rel_triples_left=dataset.rel_triples_left,
            rel_triples_right=dataset.rel_triples_right,
            attr_triples_left=dataset.attr_triples_left,
            attr_triples_right=dataset.attr_triples_right,
            ent_links=dataset.ent_links,
            folds=dataset.folds,
        )