# Source code for sasctl.pzmm.model_parameters

# Copyright (c) 2022, SAS Institute Inc., Cary, NC, USA.  All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
import json
from distutils.version import StrictVersion
from pathlib import Path
from typing import Any, Optional, Tuple, Union

import pandas as pd
from pandas import DataFrame

from .._services.model_repository import ModelRepository as mr
from ..core import RestObj, current_session, is_uuid

# xgboost is an optional dependency: fall back to None when it cannot be loaded
# so that the rest of the module stays importable. ``Exception`` (rather than a
# bare ``except``) keeps KeyboardInterrupt/SystemExit from being swallowed while
# still tolerating non-ImportError failures raised during the import.
try:
    import xgboost
except Exception:
    xgboost = None

# Pairs of (project property name, model property name) used when copying
# property values from a project onto the models it contains.
MODEL_PROPERTIES = [
    ("targetVariable", "targetVariable"),
    ("targetLevel", "targetLevel"),
    ("targetEventValue", "targetEvent"),
    ("eventProbabilityVariable", "eventProbVar"),
    ("function", "function"),
]


# TODO: Maybe just move _find_file altogether?
def _find_file(model: Union[str, dict, RestObj], file_name: str) -> Tuple[RestObj, str]:
    """
    Retrieves the contents of the first file from a registered model on SAS Model
    Manager that contains the provided file_name as an exact match or substring.

    The comparison between file names is case-insensitive.

    Parameters
    ----------
    model : str, dict, or RestObj
        The name or id of the model, or a dictionary representation of the model.
    file_name : str
        The name of the desired file or a substring that is contained within the file
        name.

    Returns
    -------
    RestObj, str
        The contents and name of the first file with a name containing file_name.

    Raises
    ------
    ValueError
        If the model cannot be resolved or if none of its files has a name
        containing file_name.
    """
    # The content URL below needs the model id. Formatting a dictionary or a model
    # name directly into the URL would produce an invalid path, so resolve the id
    # first.
    if isinstance(model, dict) and "id" in model:
        model_id = model["id"]
    elif is_uuid(model):
        model_id = model
    else:
        model_obj = mr.get_model(model)
        if model_obj is None:
            raise ValueError(f'No model "{model}" could be found.')
        model_id = model_obj["id"]

    # Lower-case the search string once instead of on every iteration
    wanted = file_name.lower()
    for file in mr.get_model_contents(model_id):
        if wanted in file.name.lower():
            correct_file = mr.get(f"models/{model_id}/contents/{file.id}/content")
            return correct_file, file.name
    raise ValueError(f'No file containing "{file_name}" exists within model files.')



# [docs]
class ModelParameters:
    @staticmethod
    def _update_json(model: str, model_json: dict, kpis: DataFrame) -> dict:
        """
        Updates the contents of the hyperparameter json file

        Parameters
        ----------
        model: str
            The id of the model being updated.
        model_json: dict
            The contents of the current KPI/parameters file within SAS Model Manager.
        kpis: pandas.DataFrame
            The dataframe containing the KPI/parameter values stored within SAS Model
            Manager at runtime.

        Returns
        -------
        dict
            The updated hyperparameter json file to be uploaded to SAS Model Manager.
        """

        model_rows = kpis.loc[kpis["ModelUUID"] == model]
        if not model_rows.empty:
            model_rows = model_rows.drop(columns=["ModelUUID"])
            model_rows.set_index("TimeLabel", inplace=True)
            kpi_json = model_rows.to_json(orient="index")
            parsed_json = json.loads(kpi_json)
            model_json["kpis"] = parsed_json
        return model_json


    # [docs]
    @staticmethod
    def generate_hyperparameters(
        model: Any, model_prefix: str, pickle_path: Union[str, Path]
    ) -> None:
        """
        Generates hyperparameters for a given model and creates a JSON file
        representation.

        Currently only supports generation of scikit-learn model hyperparameters.

        This function creates a json file named {model_prefix}Hyperparameters.json.

        Parameters
        ----------
        model : Any
            Python object representing the model.
        model_prefix : str
            Name used to create model files. (e.g. (model_prefix) +
            "Hyperparameters.json")
        pickle_path : str, pathlib.Path
            Directory location of model files.
        """

        def sklearn_params():
            """
            Generates hyperparameters for the models generated by scikit-learn.
            """
            hyperparameters = model.get_params()
            model_json = {"hyperparameters": hyperparameters}
            with open(
                Path(pickle_path) / f"{model_prefix}Hyperparameters.json", "w"
            ) as f:
                f.write(json.dumps(model_json, indent=4))

        def tf_params():
            """
            Generates hyperparameters for the models generated by tensorflow.
            """
            hyperparameters = model.get_config()
            model_json = {"hyperparameters": hyperparameters}
            with open(
                Path(pickle_path) / f"{model_prefix}Hyperparameters.json", "w"
            ) as f:
                f.write(json.dumps(model_json, indent=4))

        def xg_params():
            """
            Generates hyperparameters for the models generated by xgboost.
            """
            if not xgboost:
                raise RuntimeError(
                    "XGBoost is required to generate xgboost hyperparameters."
                )
            hyperparameters = json.loads(model.save_config())
            model_json = {"hyperparameters": hyperparameters}
            with open(
                Path(pickle_path) / f"{model_prefix}Hyperparameters.json", "w"
            ) as f:
                f.write(json.dumps(model_json, indent=4))

        def h2o_params():
            """
            Generates hyperparameters for the models generated by h2o.ai.
            """
            hyperparameters = model.get_params()
            model_json = {"hyperparameters": hyperparameters}
            with open(
                Path(pickle_path) / f"{model_prefix}Hyperparameters.json", "w"
            ) as f:
                f.write(json.dumps(model_json, indent=4))

        def statsmodels_params():
            """
            Generates hyperparameters for the models generated by statsmodels.
            """
            hyperparameters = dict()
            hyperparameters["model_type"] = model.__class__.__name__
            hyperparameters["input_variables"] = model.exog_names
            hyperparameters["weights"] = model.weights.tolist()
            model_json = {"hyperparameters": hyperparameters}
            with open(
                Path(pickle_path) / f"{model_prefix}Hyperparameters.json", "w"
            ) as f:
                f.write(json.dumps(model_json, indent=4))

        if model.__class__.__module__.__contains__("sklearn"):
            sklearn_params()
        elif model.__class__.__module__.startswith("keras"):
            tf_params()
        elif model.__class__.__module__.startswith("xgboost"):
            xg_params()
        elif model.__class__.__module__.startswith("h2o"):
            h2o_params()
        elif model.__class__.__module__.startswith("statsmodels"):
            statsmodels_params()

        else:
            raise ValueError(
                "This model type is not currently supported for hyperparameter "
                "generation."
            )



    # [docs]
    @classmethod
    def update_kpis(
        cls,
        project: Union[str, dict, RestObj],
        server: Optional[str] = "cas-shared-default",
        caslib: Optional[str] = "ModelPerformanceData",
    ) -> None:
        """
        Updates hyperparameter file to include KPIs generated by performance
        definitions, as well as any custom KPIs imported by user to the SAS KPI data
        table.

        Models for which no hyperparameter file can be retrieved are reported with a
        printed message and skipped; the remaining models are still processed.

        Parameters
        ----------
        project : str, dict, or RestObj
            The name or id of the project, or a dictionary representation of the
            project.
        server : str, optional
            Server on which the KPI data table is stored. The default value is
            "cas-shared-default".
        caslib : str, optional
            CAS Library on which the KPI data table is stored. The default value is
            "ModelPerformanceData".
        """
        kpis = cls.get_project_kpis(project, server, caslib)
        models_to_update = kpis["ModelUUID"].unique().tolist()

        for model in models_to_update:
            try:
                current_params, file_name = _find_file(model, "hyperparameters")
            except Exception:
                # Best effort: a model without a retrievable hyperparameter file
                # must not stop the update of the other models. Catching Exception
                # rather than using a bare except lets KeyboardInterrupt and
                # SystemExit propagate.
                model_name = kpis.loc[kpis["ModelUUID"] == model, "ModelName"].iloc[0]
                print(
                    f"No hyperparameter file for current model {model_name}. "
                    "Attempting for next model..."
                )
                continue

            updated_json = cls._update_json(model, current_params, kpis)
            mr.add_model_content(model, json.dumps(updated_json, indent=4), file_name)



    # [docs]
    @staticmethod
    def get_hyperparameters(model: Union[str, dict, RestObj]) -> Tuple[dict, str]:
        """
        Fetches the hyperparameter json file stored with a model on SAS Model
        Manager.

        Parameters
        ----------
        model : str, dict, or RestObj
            The name or id of the model, or a dictionary representation of the model.

        Returns
        -------
        dict, str
            Dictionary containing the contents of the hyperparameter file and the file
            name.
        """
        if not mr.is_uuid(model):
            if not (isinstance(model, dict) and "id" in model):
                # Neither an id nor a dictionary carrying one: look the model up
                model = mr.get_model(model)
            model_id = model["id"]
        else:
            model_id = model

        # The file is located by the "hyperparameters" substring in its name
        return _find_file(model_id, "hyperparameters")



    # [docs]
    @classmethod
    def add_hyperparameters(cls, model: Union[str, dict, RestObj], **kwargs) -> None:
        """
        Writes user-supplied hyperparameters into the hyperparameter file stored
        with the model in SAS Model Manager.

        Each keyword argument is stored under the "hyperparameters" key of the file,
        replacing any existing entry with the same name, and the file is then
        uploaded again under its original name.

        Parameters
        ----------
        model : str, dict, or RestObj
            The name or id of the model, or a dictionary representation of the model.
        **kwargs
            Named variables pairs representing hyperparameters to be added to the
            hyperparameter file.
        """
        if not mr.is_uuid(model):
            if not (isinstance(model, dict) and "id" in model):
                # Neither an id nor a dictionary carrying one: look the model up
                model = mr.get_model(model)
            model_id = model["id"]
        else:
            model_id = model

        contents, file_name = cls.get_hyperparameters(model_id)
        for name in kwargs:
            contents["hyperparameters"][name] = kwargs[name]

        payload = json.dumps(contents, indent=4)
        mr.add_model_content(model, payload, file_name)



    # [docs]
    @staticmethod
    def get_project_kpis(
        project: Union[str, dict, RestObj],
        server: Optional[str] = "cas-shared-default",
        caslib: Optional[str] = "ModelPerformanceData",
        filter_column: Optional[str] = None,
        filter_value: Optional[str] = None,
    ) -> DataFrame:
        """
        Create a call to CAS to return the MM_STD_KPI table (SAS Model Manager
        Standard KPI) generated when custom KPIs are uploaded or when a performance
        definition is executed on SAS Model Manager on SAS Viya 4.

        Filtering options are available as additional arguments. The filtering is based
        on column name and column value. Currently, only exact matches are available
        when filtering by this method. The filter is applied only when both
        filter_column and filter_value are provided.

        Parameters
        ----------
        project : str, dict, RestObj
            The name or id of the project, or a dictionary representation of the
            project.
        server : str, optional
            SAS Viya 4 server where the MM_STD_KPI table exists. The default value is
            "cas-shared-default".
        caslib : str, optional
            SAS Viya 4 caslib where the MM_STD_KPI table exists. The default value is
            "ModelPerformanceData".
        filter_column : str, optional
            Column name from the MM_STD_KPI table to be filtered. The default value is
            None.
        filter_value : str, optional
            Column value filter by. The default value is None

        Returns
        -------
        kpi_table_df : pandas.DataFrame
            A pandas DataFrame representing the MM_STD_KPI table. Note that SAS
            missing values are replaced with pandas-valid missing values.

        Raises
        ------
        SystemError
            If the KPI table does not exist for the project, or if no rows are
            returned (with or without the filter).
        """
        # json_normalize lives at the top level of recent pandas releases and in
        # pandas.io.json for older ones. Trying the import avoids comparing version
        # strings (and the deprecated distutils StrictVersion, which cannot parse
        # every pandas version string).
        try:
            from pandas import json_normalize
        except ImportError:
            from pandas.io.json import json_normalize

        # Collect the current session for authentication of API calls
        sess = current_session()

        # Step through options to determine project UUID
        if is_uuid(project):
            project_id = project
        elif isinstance(project, dict) and "id" in project:
            project_id = project["id"]
        else:
            project = mr.get_project(project)
            project_id = project["id"]

        # TODO: include case for large MM_STD_KPI tables
        # Call the casManagement service to collect the column names in the table
        kpi_table_columns = sess.get(
            f"casManagement/servers/{server}/"
            + f"caslibs/{caslib}/tables/"
            + f"{project_id}.MM_STD_KPI/columns?limit=10000"
        )
        if not kpi_table_columns:
            project = mr.get_project(project)
            raise SystemError(
                f"No KPI table exists for project {project.name}."
                + " Please confirm that the performance definition completed"
                + " or custom KPIs have been uploaded successfully."
            )
        # Parse through the json response to create a pandas DataFrame
        cols = json_normalize(kpi_table_columns.json(), "items")
        # Convert the columns to a readable list
        col_names = cols["name"].to_list()

        # Filter rows returned by column and value provided in arguments
        where_statement = ""
        if filter_column and filter_value:
            where_statement = f"&where={filter_column}='{filter_value}'"

        # Call the casRowSets service to return row values
        # Optional where statement is included
        kpi_table_rows = sess.get(
            f"casRowSets/servers/{server}/"
            + f"caslibs/{caslib}/tables/"
            + f"{project_id}.MM_STD_KPI/rows?limit=10000"
            + f"{where_statement}"
        )
        # If no "cells" are found in the json response, return an error
        try:
            kpi_table_df = pd.DataFrame(
                json_normalize(kpi_table_rows.json()["items"])["cells"].to_list(),
                columns=col_names,
            )
        except KeyError:
            if filter_column and filter_value:
                # f-string so that the actual filter appears in the message
                raise SystemError(
                    f"No KPIs were found when filtering with {filter_column}='"
                    f"{filter_value}'."
                )
            else:
                project_name = mr.get_project(project)["name"]
                raise SystemError(f"No KPIs were found for project {project_name}.")

        # Strip leading spaces from cells of KPI table; convert missing values to None
        kpi_table_df = kpi_table_df.apply(lambda x: x.str.strip()).replace(
            {".": None, "": None}
        )

        return kpi_table_df



    # [docs]
    @staticmethod
    def sync_model_properties(
        project: Union[str, dict, RestObj], overrwrite: Optional[bool] = False
    ) -> None:
        """
        Copies property values from a project onto every model in that project.

        The properties considered are the pairs listed in MODEL_PROPERTIES. A
        project property is copied to a model when the project defines it and
        either the model does not define the matching property or overrwrite is
        True. Every model of the project is then sent to update_model.

        Parameters
        ----------
        project : str, dict, or RestObj
            The name or id of the project, or a dictionary representation of the
            project.
        overrwrite : bool, optional
            Whether property values already set on a model are replaced by the
            project values. The default value is False.

        Raises
        ------
        ValueError
            If the project cannot be found.
        """
        # The project properties themselves are needed below, not only the id.
        # A name or a bare UUID therefore has to be resolved to the project
        # representation; otherwise the membership tests would run against a string.
        if not (isinstance(project, dict) and "id" in project):
            found = mr.get_project(project)
            if found is None:
                raise ValueError(f'No project "{project}" could be found.')
            project = found
        project_id = project["id"]

        # Get List of Models that exist in project
        models = mr.get(f"/projects/{project_id}/models")
        model_ids = [model.id for model in models]
        for model_id in model_ids:
            model = mr.get_model(model_id)
            for project_property, model_property in MODEL_PROPERTIES:
                # Only properties that are set on the project can be propagated
                if project_property not in project:
                    continue
                # Fill in missing model values; replace existing ones on request
                if model_property not in model or overrwrite:
                    model[model_property] = project[project_property]
            mr.update_model(model)