Skip to content

antakia

AntakIA

AntakIA class.

Antakia instances provide data and methods to explain a ML model.

Instance attributes

- X : pd.DataFrame, the training dataset
- y : pd.Series, the target value
- model : Model, the model to explain
- variables : a list of Variables, describing the columns of X
- X_test : pd.DataFrame, the test dataset
- y_test : pd.Series, the test target value
- score : reference scoring function

Source code in src/antakia/antakia.py
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
class AntakIA:
    """AntakIA class.

    Antakia instances provide data and methods to explain a ML model.

    Instance attributes
    -------------------
    X : pd.DataFrame
        the training dataset (column names are converted to strings)
    y : pd.Series
        the target value, cast to float
    model : Model
        the model to explain
    variables : DataVariables
        description of the columns of X
    X_test : pd.DataFrame | None
        the test dataset, if any
    y_test : pd.Series | None
        the test target value, if any
    X_exp : pd.DataFrame | None
        optional values sharing X's index and columns
        (presumably pre-computed explanations -- TODO confirm)
    score : callable | str
        reference scoring function or its name
    problem_category : ProblemCategory
        resolved category of the ML problem
    gui : GUI
        the graphical interface built from the attributes above
    """

    def __init__(
            self,
            X: pd.DataFrame,
            y: pd.Series,
            model,
            variables: DataVariables | List[Dict[str, Any]] | pd.DataFrame | None = None,
            X_test: pd.DataFrame | None = None,
            y_test: pd.Series | None = None,
            X_exp: pd.DataFrame | None = None,
            score: callable | str = 'auto',
            problem_category: str = 'auto'
    ):
        """
        AntakIA constructor.

        Parameters:
            X : pd.DataFrame the training dataset
            y : pd.Series the target value
            model : Model
                the model to explain
            variables : a list of Variables (or a DataFrame), describing
                the columns of X; guessed from X when None
            X_test : pd.DataFrame the test dataset (optional)
            y_test : pd.Series the test target value (optional)
            X_exp : pd.DataFrame sharing X's index (optional)
            score : reference scoring function, its name, or 'auto'
            problem_category : name of a ProblemCategory member

        Raises:
            ValueError: if the model is rejected by is_valid_model, if
                problem_category is unknown or if variables is invalid.
        """

        load_dotenv()

        if not is_valid_model(model):
            # Single formatted message: passing several args to ValueError
            # renders the error as a tuple.
            raise ValueError(f"{model!r} should implement predict and score methods")
        X, y, X_exp = self._preprocess_data(X, y, X_exp)
        # The test set is optional (defaults to None): only validate it
        # when it was actually supplied.
        if X_test is not None:
            X_test, y_test, _ = self._preprocess_data(X_test, y_test, None)

        self.X = X
        if y.ndim > 1:
            y = y.squeeze()
        self.y = y.astype(float)

        self.X_test = X_test
        if y_test is not None and y_test.ndim > 1:
            y_test = y_test.squeeze()
        self.y_test = y_test

        self.model = model

        self.X_exp = X_exp

        self.problem_category = self._preprocess_problem_category(problem_category, model, X)
        self.score = self._preprocess_score(score, self.problem_category)

        self.set_variables(X, variables)

        self.gui = GUI(
            self.X,
            self.y,
            self.model,
            self.variables,
            self.X_test,
            self.y_test,
            self.X_exp,
            self.score,
            self.problem_category
        )

    def set_variables(self, X, variables):
        """Set ``self.variables`` from a user description or by guessing.

        Parameters:
            X : pd.DataFrame the dataset the variables describe
            variables : a list (imported with Variable.import_variable_list),
                a pd.DataFrame (imported with Variable.import_variable_df)
                or None (variables are then guessed from X)

        Raises:
            ValueError: if a list does not have one entry per column of X,
                or if variables is neither a list, a DataFrame nor None.
        """
        # NOTE(review): the constructor annotation also advertises
        # DataVariables; it is only accepted here if it is a list -- confirm.
        if variables is None:
            self.variables = Variable.guess_variables(X)
            return
        if isinstance(variables, list):
            self.variables: DataVariables = Variable.import_variable_list(variables)
            if len(self.variables) != len(X.columns):
                raise ValueError("Provided variable list must be the same length of the dataframe")
        elif isinstance(variables, pd.DataFrame):
            self.variables = Variable.import_variable_df(variables)
        else:
            raise ValueError("Provided variable list must be a list or a pandas DataFrame")

    def start_gui(self) -> GUI:
        """Show the GUI splash screen and return the result of that call."""
        return self.gui.show_splash_screen()

    def export_regions(self):
        """Return the GUI's current ``region_set``."""
        return self.gui.region_set

    def _preprocess_data(self, X: pd.DataFrame | None, y, X_exp: pd.DataFrame | None):
        """Normalize and validate a dataset.

        numpy inputs are wrapped into pandas objects, column names of X are
        converted to strings (in place on the provided DataFrame) and copied
        onto X_exp, and the indexes of X, y and X_exp are checked to match.

        Parameters:
            X : pd.DataFrame | np.ndarray | None the features
            y : pd.Series | np.ndarray | None the target
            X_exp : pd.DataFrame | np.ndarray | None values aligned with X

        Returns:
            the (X, y, X_exp) tuple; returned without validation when X is
            None (dataset not provided).

        Raises:
            ValueError: if X is provided without y.
            AssertionError: if indexes differ (pd.testing.assert_index_equal).
            IndexError: if reindexing X on X_exp's index introduces NaNs.
        """
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        if isinstance(X_exp, np.ndarray):
            X_exp = pd.DataFrame(X_exp)
        if isinstance(y, np.ndarray):
            y = pd.Series(y)

        if X is None:
            # Optional dataset that was not supplied: nothing to validate.
            return X, y, X_exp
        if y is None:
            raise ValueError('y must be provided together with X')

        X.columns = [str(col) for col in X.columns]
        if X_exp is not None:
            X_exp.columns = X.columns
            pd.testing.assert_index_equal(X.index, X_exp.index, check_names=False)
            if X.reindex(X_exp.index).iloc[:, 0].isna().sum() != X.iloc[:, 0].isna().sum():
                raise IndexError('X and X_exp must share the same index')
        pd.testing.assert_index_equal(X.index, y.index, check_names=False)
        return X, y, X_exp

    def _preprocess_problem_category(self, problem_category: str, model, X: pd.DataFrame) -> ProblemCategory:
        """Resolve the requested problem category into a ProblemCategory.

        For 'auto' and 'classification' the model is inspected: a
        ``predict_proba`` attribute yields 'classification_with_proba';
        otherwise predictions on a sample of at most 100 rows of X are
        computed and a 2-D output with several columns yields
        'classification_proba'. Failing that, 'auto' resolves to
        'regression' and 'classification' to 'classification_label_only'.
        Any other valid name is looked up directly.

        Raises:
            ValueError: if problem_category is not a ProblemCategory name.
        """
        if problem_category not in [e.name for e in ProblemCategory]:
            raise ValueError('Invalid problem category')
        if problem_category not in ('auto', 'classification'):
            return ProblemCategory[problem_category]

        if hasattr(model, 'predict_proba'):
            return ProblemCategory['classification_with_proba']
        # Use the arguments rather than self.model / self.X so the method
        # does not depend on the order of assignments in __init__.
        pred = model.predict(X.sample(min(100, len(X))))
        pred_shape = np.shape(pred)
        if len(pred_shape) > 1 and pred_shape[1] > 1:
            return ProblemCategory['classification_proba']
        if problem_category == 'auto':
            return ProblemCategory['regression']
        return ProblemCategory['classification_label_only']

    def _preprocess_score(self, score, problem_category):
        """Resolve the reference score.

        A callable or any string other than 'auto' is returned unchanged;
        'auto' becomes 'mse' for regression problems and 'accuracy'
        otherwise.
        """
        if callable(score) or score != 'auto':
            return score
        if problem_category == ProblemCategory.regression:
            return 'mse'
        return 'accuracy'
__init__(X, y, model, variables=None, X_test=None, y_test=None, X_exp=None, score='auto', problem_category='auto')

AntakIA constructor.

Parameters:

Name Type Description Default
X

pd.DataFrame the training dataset

required
y

pd.Series the target value

required
model

Model the model to explain

required
variables

a list of Variables, describing the columns of X

None
X_test

pd.DataFrame the test dataset

None
y_test

pd.Series the test target value

None
score

reference scoring function

'auto'
Source code in src/antakia/antakia.py
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def __init__(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        model,
        variables: DataVariables | List[Dict[str, Any]] | pd.DataFrame | None = None,
        X_test: pd.DataFrame | None = None,
        y_test: pd.Series | None = None,
        X_exp: pd.DataFrame | None = None,
        score: callable | str = 'auto',
        problem_category: str = 'auto'
):
    """
    AntakIA constructor.

    Parameters:
        X : pd.DataFrame the training dataset
        y : pd.Series the target value
        model : Model
            the model to explain
        variables : a list of Variables (or a DataFrame), describing
            the columns of X
        X_test : pd.DataFrame the test dataset (optional)
        y_test : pd.Series the test target value (optional)
        X_exp : pd.DataFrame passed through _preprocess_data along with X
            (optional)
        score : reference scoring function, its name, or 'auto'
        problem_category : problem category name, 'auto' by default

    Raises:
        ValueError: if the model is rejected by is_valid_model.
    """

    load_dotenv()

    if not is_valid_model(model):
        # Single formatted message: passing several args to ValueError
        # renders the error as a tuple.
        raise ValueError(f"{model!r} should implement predict and score methods")
    X, y, X_exp = self._preprocess_data(X, y, X_exp)
    # The test set is optional (defaults to None): only preprocess it when
    # it was actually supplied.
    if X_test is not None:
        X_test, y_test, _ = self._preprocess_data(X_test, y_test, None)

    self.X = X
    if y.ndim > 1:
        y = y.squeeze()
    self.y = y.astype(float)

    self.X_test = X_test
    if y_test is not None and y_test.ndim > 1:
        y_test = y_test.squeeze()
    self.y_test = y_test

    self.model = model

    self.X_exp = X_exp

    self.problem_category = self._preprocess_problem_category(problem_category, model, X)
    self.score = self._preprocess_score(score, self.problem_category)

    self.set_variables(X, variables)

    self.gui = GUI(
        self.X,
        self.y,
        self.model,
        self.variables,
        self.X_test,
        self.y_test,
        self.X_exp,
        self.score,
        self.problem_category
    )