Source code for spec2vec.model_building

"""This module contains functions that will help users to train a word2vec model
through gensim.
"""
import logging
from typing import List, Tuple, Union
import gensim
from spec2vec.utils import ModelSaver, TrainingProgressLogger


logger = logging.getLogger("spec2vec")


def train_new_word2vec_model(documents: List, iterations: Union[List[int], int],
                             filename: str = None, progress_logger: bool = True,
                             **settings) -> gensim.models.Word2Vec:
    """Train a new Word2Vec model (using gensim). Save to file if filename is given.

    Example code on how to train a word2vec model on a corpus (=list of documents)
    that is derived from a given set of spectrums (list of matchms.Spectrum instances):

    .. code-block:: python

        from spec2vec import SpectrumDocument
        from spec2vec.model_building import train_new_word2vec_model

        documents = [SpectrumDocument(s, n_decimals=1) for s in spectrums]
        model = train_new_word2vec_model(documents, iterations=20, vector_size=200,
                                         workers=1, progress_logger=False)

    Parameters
    ----------
    documents:
        List of documents, each document being a list of words (strings).
    iterations:
        Specifies the number of training iterations. This can be done by setting
        iterations to the total number of training epochs (e.g. "iterations=15"),
        or by passing a list of iterations (e.g. "iterations=[5,10,15]") which will
        also lead to a total training of max(iterations) epochs, but will save the
        model for every iteration in the list. Temporary models will be saved using
        the name: filename_TEMP_{#iteration}epoch.model.
    filename: str,
        Filename to save model. Default is None, which means no model will be saved.
        If a list of iterations is passed (e.g. "iterations=[5,10,15]"), then
        intermediate models will be saved during training (here after 5 and 10
        iterations) using the pattern: filename_TEMP_{#iteration}epoch.model
    learning_rate_initial:
        Set initial learning rate. Default is 0.025.
    learning_rate_decay:
        After every epoch the learning rate will be lowered by the
        learning_rate_decay. Default is 0.00025.
    progress_logger:
        If True, the training progress will be printed every epoch. Default is True.
    **settings
        All other named arguments will be passed to the
        :py:class:`gensim.models.word2vec.Word2Vec` constructor.
    sg: int (0,1)
        For sg = 0 --> CBOW model, for sg = 1 --> skip gram model
        (see Gensim documentation). Default for Spec2Vec is 0.
    negative: int
        from Gensim: If > 0, negative sampling will be used, the int for negative
        specifies how many “noise words” should be drawn (usually between 5-20).
        If set to 0, no negative sampling is used. Default for Spec2Vec is 5.
    vector_size: int,
        Dimensions of word vectors. Default is 300.
    window: int,
        Window size for context words (small for local context, larger for global
        context). Spec2Vec expects large windows. Default is 500.
    min_count: int,
        Only consider words that occur at least min_count times in the corpus.
        Default is 1.
    workers: int,
        Number of threads to run the training on (should not be more than the
        number of cores/threads). Default is 4.

    Returns
    -------
    word2vec_model
        Gensim word2vec model.
    """
    settings = set_spec2vec_defaults(**settings)
    num_of_epochs = max(iterations) if isinstance(iterations, list) else iterations

    # Convert spec2vec style arguments to gensim style arguments
    settings = learning_rates_to_gensim_style(num_of_epochs, **settings)

    # Set callbacks
    callbacks = []
    if progress_logger:
        training_progress_logger = TrainingProgressLogger(num_of_epochs)
        callbacks.append(training_progress_logger)
    if filename:
        if isinstance(iterations, int):
            iterations = [iterations]
        model_saver = ModelSaver(num_of_epochs, iterations, filename)
        callbacks.append(model_saver)

    # Train word2vec model
    model = gensim.models.Word2Vec(documents, callbacks=callbacks, **settings)

    return model

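# A minimal usage sketch (not part of the module) for the intermediate-saving
# behaviour described in the docstring above. It assumes `spectrums` is a list of
# matchms.Spectrum instances and uses the placeholder filename "references.model":
#
#     from spec2vec import SpectrumDocument
#     from spec2vec.model_building import train_new_word2vec_model
#
#     documents = [SpectrumDocument(s, n_decimals=1) for s in spectrums]
#     # Trains for max(iterations) = 15 epochs; intermediate models are saved
#     # after 5 and 10 epochs following the filename_TEMP_{#iteration}epoch.model
#     # pattern, and the final model is saved to references.model.
#     model = train_new_word2vec_model(documents, iterations=[5, 10, 15],
#                                      filename="references.model")
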
def set_spec2vec_defaults(**settings):
    """Set spec2vec default argument values (where no user input is given)."""
    defaults = {
        "sg": 0,
        "negative": 5,
        "vector_size": 300,
        "window": 500,
        "min_count": 1,
        "learning_rate_initial": 0.025,
        "learning_rate_decay": 0.00025,
        "workers": 4,
        "compute_loss": True,
    }
    assert "min_alpha" not in settings, "Expect 'learning_rate_decay' to describe learning rate decrease."
    assert "alpha" not in settings, "Expect 'learning_rate_initial' instead of 'alpha'."

    # Set default parameters or replace by **settings input
    for key, value in defaults.items():
        if key in settings:
            msg = f"The value of {key} is set from {value} (default) to {settings[key]}"
            logger.info(msg)
        else:
            settings[key] = value
    return settings

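# A small sketch of how set_spec2vec_defaults merges user input with the defaults
# above (assumes the "spec2vec" logger is configured to emit INFO messages):
#
#     settings = set_spec2vec_defaults(vector_size=200)
#     # logs: "The value of vector_size is set from 300 (default) to 200"
#     # settings["vector_size"] -> 200 (user choice kept)
#     # settings["window"]      -> 500 (missing keys filled with defaults)
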
def learning_rates_to_gensim_style(num_of_epochs, **settings):
    """Convert "learning_rate_initial" and "learning_rate_decay" to gensim "alpha" and "min_alpha"."""
    alpha, min_alpha = set_learning_rate_decay(settings["learning_rate_initial"],
                                               settings["learning_rate_decay"], num_of_epochs)
    settings["alpha"] = alpha
    settings["min_alpha"] = min_alpha
    settings["epochs"] = num_of_epochs

    # Remove non-Gensim arguments from settings
    del settings["learning_rate_initial"]
    del settings["learning_rate_decay"]
    return settings

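# Sketch of the spec2vec -> gensim argument conversion, using the defaults from
# set_spec2vec_defaults:
#
#     settings = set_spec2vec_defaults()
#     settings = learning_rates_to_gensim_style(15, **settings)
#     # settings["alpha"]     -> 0.025
#     # settings["min_alpha"] -> 0.025 - 15 * 0.00025 = 0.02125
#     # settings["epochs"]    -> 15
#     # "learning_rate_initial" and "learning_rate_decay" are removed, so the
#     # dictionary can be passed directly to gensim.models.Word2Vec.
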
def set_learning_rate_decay(learning_rate_initial: float, learning_rate_decay: float,
                            num_of_epochs: int) -> Tuple[float, float]:
    """Set the learning rate parameters for Gensim model training.

    The learning rate in Gensim model training is defined by an initial rate (alpha)
    and a final rate (min_alpha), which can be unintuitive. Here those parameters
    will be set based on the given values for learning_rate_initial, num_of_epochs,
    and learning_rate_decay.

    Parameters
    ----------
    learning_rate_initial:
        Set initial learning rate.
    learning_rate_decay:
        After every epoch, the learning rate will be lowered by the
        learning_rate_decay.
    num_of_epochs:
        Total number of epochs for training.

    Returns
    -------
    alpha:
        Initial learning rate.
    min_alpha:
        Final learning rate.
    """
    min_alpha = learning_rate_initial - num_of_epochs * learning_rate_decay
    if min_alpha < 0:
        msg = ("Number of total iterations is too high for given learning_rate_decay. "
               f"Learning_rate_decay will be set from {learning_rate_decay} "
               f"to {learning_rate_initial / num_of_epochs}.")
        logger.warning(msg)
        min_alpha = 0
    return learning_rate_initial, min_alpha

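# Worked example of the decay arithmetic implemented above:
#
#     alpha, min_alpha = set_learning_rate_decay(0.025, 0.00025, 15)
#     # alpha = 0.025, min_alpha = 0.025 - 15 * 0.00025 = 0.02125
#
#     # With too many epochs the final rate would turn negative, so it is clipped
#     # to 0 and a warning is logged:
#     alpha, min_alpha = set_learning_rate_decay(0.025, 0.00025, 200)
#     # alpha = 0.025, min_alpha = 0  (since 0.025 - 200 * 0.00025 = -0.025 < 0)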