# Source code for sylloge.med_bbk_loader

import pathlib
from typing import Any, Dict, Literal, Optional, Union, overload

import dask.dataframe as dd
import pandas as pd

from .base import (
    BACKEND_LITERAL,
    BASE_DATASET_MODULE,
    BinaryZipEADataset,
    DataFrameType,
)

# Storage handle for this dataset, obtained from the shared base module under
# the "med_bbk" key. The loader below uses it to fetch the zip archive
# (``ensure``) and passes it to ``create_cache_path``.
MED_BBK_MODULE = BASE_DATASET_MODULE.module("med_bbk")


class MED_BBK(BinaryZipEADataset[DataFrameType]):
    """Class containing the MED-BBK dataset.

    Published in Zhang, Z. et al. (2020), *An Industry Evaluation of
    Embedding-based Entity Alignment*, COLING.
    """

    #: The link to the zip file
    _ZIP_LINK: str = (
        "https://github.com/ZihengZZH/industry-eval-EA/raw/main/benchmark/industry.zip"
    )

    #: The hex digest for the zip file
    _SHA512: str = "da1ee2b025070fd6890fb7e77b07214af3767b5ae85bcdc1bb36958b4b8dd935bc636e3466b94169158940a960541f96284e3217d32976bfeefa56e29d4a9e0d"

    @overload
    def __init__(
        self: "MED_BBK[pd.DataFrame]",
        backend: Literal["pandas"] = "pandas",
        use_cache: bool = True,
        cache_path: Optional[Union[str, pathlib.Path]] = None,
    ):
        ...

    @overload
    def __init__(
        self: "MED_BBK[dd.DataFrame]",
        backend: Literal["dask"] = "dask",
        use_cache: bool = True,
        cache_path: Optional[Union[str, pathlib.Path]] = None,
    ):
        ...

    def __init__(
        self,
        backend: BACKEND_LITERAL = "pandas",
        use_cache: bool = True,
        cache_path: Optional[Union[str, pathlib.Path]] = None,
    ):
        """Initialize an MED-BBK dataset.

        :param backend: Whether to use "pandas" or "dask"
        :param use_cache: whether to use cache or not
        :param cache_path: Path where cache will be stored/loaded
        """
        # Directory inside the archive that holds the dataset files.
        inner_dir = "industry"

        # Make sure the zip file is available locally before anything else,
        # handing the expected SHA-512 digest to the downloader.
        archive = MED_BBK_MODULE.ensure(
            url=MED_BBK._ZIP_LINK,
            download_kwargs={"hexdigests": {"sha512": MED_BBK._SHA512}},
        )

        # Resolve the cache location from the user-supplied path (may be None).
        resolved_cache = self.create_cache_path(MED_BBK_MODULE, inner_dir, cache_path)

        super().__init__(  # type: ignore[misc]
            backend=backend,  # type: ignore[arg-type]
            dataset_names=("MED", "BBK"),
            zip_path=archive,
            inner_path=pathlib.PurePosixPath(inner_dir),
            use_cache=use_cache,
            cache_path=resolved_cache,
        )

    def initial_read(self, backend: BACKEND_LITERAL) -> Dict[str, Any]:
        """Read the raw data and put the two graphs in MED-first order.

        :param backend: Whether to use "pandas" or "dask"
        :return: Mapping with the keys ``rel_triples``, ``attr_triples`` and
            ``ent_links``
        """
        raw = super().initial_read(backend=backend)
        # In the files MED is KG2 and BBK is KG1, so the per-graph entries are
        # reversed to line up with the dataset names ("MED", "BBK").
        reordered = {key: raw[key][::-1] for key in ("rel_triples", "attr_triples")}
        # The entity links are handed through exactly as read.
        reordered["ent_links"] = raw["ent_links"]
        return reordered

    @property
    def _canonical_name(self) -> str:
        """Return the name of the concrete class as the dataset's name."""
        return self.__class__.__name__

    @property
    def _param_repr(self) -> str:
        """Return an empty parameter representation.

        NOTE(review): presumably empty because this dataset has no
        variant-selecting parameters -- confirm against the base class.
        """
        return ""