Source code for spec2vec.serialization.model_importing

import json
import os
from typing import Union
import numpy as np
import scipy.sparse
from gensim.models import KeyedVectors


class Word2VecLight:
    """
    A lightweight version of :class:`~gensim.models.Word2Vec`.

    The objects of this class follow the interface of the original
    :class:`~gensim.models.Word2Vec` to the point necessary to calculate
    Spec2Vec scores. The model cannot be used for further training.
    """

    def __init__(self, model: dict, weights: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]):
        """
        Parameters
        ----------
        model:
            A dictionary containing the model's metadata. The dictionary is
            read but never modified, so it can be reused by the caller.
        weights:
            A numpy array or a scipy sparse matrix containing the model's weights.

        Raises
        ------
        ValueError
            If the keys of ``model`` do not match the expected keys.
        """
        self.wv: KeyedVectors = self._KeyedVectorsBuilder().from_dict(model).with_weights(weights).build()

    class _KeyedVectorsBuilder:
        """Step-by-step builder of the :class:`~gensim.models.KeyedVectors` stored in ``Word2VecLight.wv``."""

        def __init__(self):
            self.vector_size = None
            self.weights = None

        def build(self) -> KeyedVectors:
            """Create the :class:`~gensim.models.KeyedVectors` from the collected state.

            The attributes of the new object are replaced wholesale by the
            builder's attributes and the weights are exposed as ``vectors``.
            """
            keyed_vectors = KeyedVectors(self.vector_size)
            # Hand over a copy of the state: sharing one attribute dictionary
            # would let later changes to the builder leak into the result.
            keyed_vectors.__dict__ = dict(self.__dict__)
            keyed_vectors.vectors = self.weights
            return keyed_vectors

        def from_dict(self, dictionary: dict):
            """Take the builder's attributes from the model's dictionary representation.

            Parameters
            ----------
            dictionary:
                The model's metadata. It is copied (shallowly) and left untouched.

            Returns
            -------
            The builder itself, to allow call chaining.

            Raises
            ------
            ValueError
                If the keys of ``dictionary`` do not match the expected keys.
            """
            expected_keys = {"vector_size", "__numpys", "__scipys", "__ignoreds", "__recursive_saveloads",
                             "index_to_key", "norms", "key_to_index", "__weights_format"}

            # Work on a shallow copy; adopting the caller's dictionary as the
            # attribute store would mutate it on every later attribute write.
            state = dict(dictionary)
            # backward compatibility: tolerate (and drop) an extra "next_index" entry
            state.pop("next_index", None)

            if state.keys() != expected_keys:
                raise ValueError("The keys of model's dictionary representation do not match the expected keys.")

            # Updating (rather than replacing) keeps weights that were set earlier.
            self.__dict__.update(state)
            return self

        def with_weights(self, weights: Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]):
            """Store the weights matrix and return the builder itself for call chaining."""
            self.weights = weights
            return self
def import_model(model_file, weights_file) -> Word2VecLight:
    """
    Load a lightweight version of a :class:`~gensim.models.Word2Vec` model from disk.

    Parameters
    ----------
    model_file:
        Path of the json file holding the model's metadata.
    weights_file:
        Path of the `.npy` file holding the model's weights.

    Returns
    -------
    :class:`~spec2vec.serialization.model_importing.Word2VecLight` - a lightweight version of a
    :class:`~gensim.models.Word2Vec`
    """
    with open(model_file, "r", encoding="utf-8") as handle:
        metadata: dict = json.load(handle)

    # The metadata names the format in which the weights have to be loaded.
    return Word2VecLight(metadata, load_weights(weights_file, metadata["__weights_format"]))
def load_weights(weights_file: Union[str, os.PathLike],
                 weights_format: str) -> Union[np.ndarray, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]:
    """
    Load the model's weights from a `.npy` file in the requested format.

    Parameters
    ----------
    weights_file:
        Path of the `.npy` file to read; it is loaded with pickling disabled.
    weights_format:
        One of ``"np.ndarray"``, ``"csr_matrix"`` or ``"csc_matrix"``.

    Returns
    -------
    The loaded array as is for ``"np.ndarray"``, otherwise the array converted
    to the named scipy sparse matrix type.

    Raises
    ------
    KeyError
        If ``weights_format`` is not one of the supported formats. The file is
        read before the format is checked.
    """
    loaded = np.load(weights_file, allow_pickle=False)

    if weights_format == "np.ndarray":
        return loaded
    if weights_format == "csr_matrix":
        return scipy.sparse.csr_matrix(loaded)
    if weights_format == "csc_matrix":
        return scipy.sparse.csc_matrix(loaded)
    raise KeyError(weights_format)