Skip to content

AGNES

toyml.clustering.agnes.AGNES dataclass

AGNES(n_cluster: int, linkage: Literal['single', 'complete', 'average'] = 'single', distance_metric: Literal['euclidean'] = 'euclidean', distance_matrix_: list[list[float]] = list(), clusters_: list[ClusterTree] = list(), labels_: list[int] = list(), cluster_tree_: ClusterTree | None = None, linkage_matrix: list[list[float]] = list(), _cluster_index: int = 0)

Agglomerative clustering algorithm (Bottom-up Hierarchical Clustering).

Examples:

>>> from toyml.clustering import AGNES
>>> dataset = [[1, 0], [1, 1], [1, 2], [10, 0], [10, 1], [10, 2]]
>>> agnes = AGNES(n_cluster=2).fit(dataset)
>>> print(agnes.labels_)
[0, 0, 0, 1, 1, 1]
>>> # Using fit_predict method
>>> labels = agnes.fit_predict(dataset)
>>> print(labels)
[0, 0, 0, 1, 1, 1]
>>> # Using different linkage methods
>>> agnes = AGNES(n_cluster=2, linkage="complete").fit(dataset)
>>> print(agnes.labels_)
[0, 0, 0, 1, 1, 1]
>>> # Plotting dendrogram
>>> agnes = AGNES(n_cluster=1).fit(dataset)
>>> agnes.plot_dendrogram(show=True)
The AGNES Dendrogram Plot

AGNES Dendrogram

References
  1. Zhou Zhihua
  2. Tan

n_cluster instance-attribute

n_cluster: int

The number of clusters, specified by user.

linkage class-attribute instance-attribute

linkage: Literal['single', 'complete', 'average'] = 'single'

The linkage method to use.

distance_metric class-attribute instance-attribute

distance_metric: Literal['euclidean'] = 'euclidean'

The distance metric to use.(For now we only support euclidean).

distance_matrix_ class-attribute instance-attribute

distance_matrix_: list[list[float]] = field(default_factory=list)

The distance matrix.

clusters_ class-attribute instance-attribute

clusters_: list[ClusterTree] = field(default_factory=list)

The clusters.

labels_ class-attribute instance-attribute

labels_: list[int] = field(default_factory=list)

The labels of each sample.

fit

fit(dataset: list[list[float]]) -> AGNES

Fit the model.

Source code in toyml/clustering/agnes.py
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def fit(self, dataset: list[list[float]]) -> AGNES:
    """Fit the model."""
    self._validate(dataset)
    n = len(dataset)
    self.clusters_ = [ClusterTree(cluster_index=i, sample_indices=[i]) for i in range(n)]
    self._cluster_index = n
    self.distance_matrix_ = self._get_init_distance_matrix(dataset)
    while len(self.clusters_) > self.n_cluster:
        (i, j), cluster_ij_distance = self._get_closest_clusters()
        # merge cluster_i and cluster_j
        self._merge_clusters(i, j, cluster_ij_distance)
        # update distance matrix
        self._update_distance_matrix(dataset, i, j)
    # build cluster_tree_
    self.cluster_tree_ = self._build_cluster_tree(n)
    # assign dataset labels
    self._get_labels(len(dataset))
    return self

fit_predict

fit_predict(dataset: list[list[float]]) -> list[int]

Fit the model and return the labels of each sample.

Source code in toyml/clustering/agnes.py
86
87
88
89
def fit_predict(self, dataset: list[list[float]]) -> list[int]:
    """Fit the model and return the labels of each sample."""
    self.fit(dataset)
    return self.labels_

_validate

_validate(dataset: list[list[float]]) -> None

Validate the dataset.

Source code in toyml/clustering/agnes.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def _validate(self, dataset: list[list[float]]) -> None:
    """Validate the dataset."""
    n = len(dataset)
    # check the number of clusters
    if self.n_cluster > n:
        msg = (
            "The number of clusters should be less than the number of data points,"
            "but got n_cluster={self.n_cluster} and n={n}"
        )
        raise ValueError(
            msg,
        )
    # check the dataset rows
    if any(len(row) != len(dataset[0]) for row in dataset):
        msg = "All samples should have the same dimension"
        raise ValueError(msg)

_get_clusters_distance

_get_clusters_distance(dataset: list[list[float]], cluster1: ClusterTree, cluster2: ClusterTree) -> float

Get the distance between clusters c1 and c2 using the specified linkage method.

Source code in toyml/clustering/agnes.py
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def _get_clusters_distance(
    self,
    dataset: list[list[float]],
    cluster1: ClusterTree,
    cluster2: ClusterTree,
) -> float:
    """Get the distance between clusters c1 and c2 using the specified linkage method."""
    distances = [
        self._get_distance(dataset[i], dataset[j]) for i in cluster1.sample_indices for j in cluster2.sample_indices
    ]

    if self.linkage == "single":
        return min(distances)
    if self.linkage == "complete":
        return max(distances)
    if self.linkage == "average":
        return sum(distances) / len(distances)
    msg = f"Invalid linkage method: {self.linkage}"
    raise ValueError(msg)

_get_init_distance_matrix

_get_init_distance_matrix(dataset: list[list[float]]) -> list[list[float]]

Gte initial distance matrix from sample points.

Source code in toyml/clustering/agnes.py
128
129
130
131
132
133
134
135
136
137
def _get_init_distance_matrix(self, dataset: list[list[float]]) -> list[list[float]]:
    """Gte initial distance matrix from sample points."""
    n = len(dataset)
    distance_matrix = [[0.0 for _ in range(n)] for _ in range(n)]
    for i, cluster_i in enumerate(self.clusters_):
        for j, cluster_j in enumerate(self.clusters_):
            if j >= i:
                distance_matrix[i][j] = self._get_clusters_distance(dataset, cluster_i, cluster_j)
                distance_matrix[j][i] = distance_matrix[i][j]
    return distance_matrix

_get_closest_clusters

_get_closest_clusters() -> tuple[tuple[int, int], float]

Search the distance matrix to get the closest clusters.

Source code in toyml/clustering/agnes.py
139
140
141
142
143
144
145
146
147
148
def _get_closest_clusters(self) -> tuple[tuple[int, int], float]:
    """Search the distance matrix to get the closest clusters."""
    min_dist = math.inf
    closest_clusters = (0, 0)
    for i in range(len(self.distance_matrix_) - 1):
        for j in range(i + 1, len(self.distance_matrix_)):
            if self.distance_matrix_[i][j] < min_dist:
                min_dist = self.distance_matrix_[i][j]
                closest_clusters = (i, j)
    return closest_clusters, min_dist

_merge_clusters

_merge_clusters(i: int, j: int, cluster_ij_distance: float) -> None

Merge two clusters to a new cluster.

PARAMETER DESCRIPTION
i

the first indices of the clusters to merge

TYPE: int

j

the second indices of the clusters to merge

TYPE: int

cluster_ij_distance

the distance between the two clusters

TYPE: float

Source code in toyml/clustering/agnes.py
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
def _merge_clusters(self, i: int, j: int, cluster_ij_distance: float) -> None:
    """Merge two clusters to a new cluster.

    Args:
        i: the first indices of the clusters to merge
        j: the second indices of the clusters to merge
        cluster_ij_distance: the distance between the two clusters
    """
    cluster_i, cluster_j = self.clusters_[i], self.clusters_[j]
    # build new parent cluster
    parent_cluster = ClusterTree(cluster_index=self._cluster_index)
    self._cluster_index += 1
    # sort the sample indices for convenience
    parent_cluster.sample_indices = sorted(cluster_i.sample_indices + cluster_j.sample_indices)
    parent_cluster.add_child(cluster_i)
    parent_cluster.add_child(cluster_j)
    parent_cluster.children_cluster_distance = cluster_ij_distance
    # replace cluster1 with parent_cluster in clusters_
    self.clusters_[i] = parent_cluster
    self.clusters_.pop(j)

    # linkage matrix
    self.linkage_matrix.append(
        [cluster_i.cluster_index, cluster_j.cluster_index, cluster_ij_distance, len(parent_cluster.sample_indices)],
    )

_update_distance_matrix

_update_distance_matrix(dataset: list[list[float]], i: int, j: int) -> None

Update the distance matrix after merging two clusters.

Source code in toyml/clustering/agnes.py
185
186
187
188
189
190
191
192
193
194
def _update_distance_matrix(self, dataset: list[list[float]], i: int, j: int) -> None:
    """Update the distance matrix after merging two clusters."""
    self.distance_matrix_.pop(j)
    # remove jth column
    for raw in self.distance_matrix_:
        raw.pop(j)
    # calc new dist
    for j in range(len(self.clusters_)):  # noqa: PLR1704
        self.distance_matrix_[i][j] = self._get_clusters_distance(dataset, self.clusters_[i], self.clusters_[j])
        self.distance_matrix_[j][i] = self.distance_matrix_[i][j]

plot_dendrogram

plot_dendrogram(figure_name: str = 'agnes_dendrogram.png', show: bool = False) -> None

Plot the dendrogram of the clustering result.

This method visualizes the hierarchical structure of the clustering using a dendrogram. It requires the number of clusters to be set to 1 during initialization.

PARAMETER DESCRIPTION
figure_name

The filename for saving the plot. Defaults to "agnes_dendrogram.png".

TYPE: str DEFAULT: 'agnes_dendrogram.png'

show

If True, displays the plot. Defaults to False.

TYPE: bool DEFAULT: False

RAISES DESCRIPTION
ValueError

If the number of clusters is not 1.

Note

This method requires matplotlib and scipy to be installed.

Source code in toyml/clustering/agnes.py
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
def plot_dendrogram(
    self,
    figure_name: str = "agnes_dendrogram.png",
    show: bool = False,
) -> None:
    """Plot the dendrogram of the clustering result.

    This method visualizes the hierarchical structure of the clustering
    using a dendrogram. It requires the number of clusters to be set to 1
    during initialization.

    Args:
        figure_name: The filename for saving the plot.
                           Defaults to "agnes_dendrogram.png".
        show: If True, displays the plot. Defaults to False.

    Raises:
        ValueError: If the number of clusters is not 1.

    Note:
        This method requires matplotlib and scipy to be installed.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from scipy.cluster.hierarchy import dendrogram

    if self.n_cluster != 1:
        msg = "The number of clusters should be 1 to plot dendrogram"
        raise ValueError(msg)
    # Plot the dendrogram
    plt.figure(figsize=(10, 7))
    dendrogram(np.array(self.linkage_matrix))
    plt.title("AGNES Dendrogram")
    plt.xlabel("Sample Index")
    plt.ylabel("Distance")
    plt.savefig(f"{figure_name}", dpi=300, bbox_inches="tight")
    if show:
        plt.show()

toyml.clustering.agnes.ClusterTree dataclass

ClusterTree(cluster_index: int, parent: ClusterTree | None = None, children: list[ClusterTree] = list(), sample_indices: list[int] = list(), children_cluster_distance: float | None = None)

Represents a node in the hierarchical clustering tree.

Each node is a cluster containing sample indices. Leaf nodes represent individual samples, while internal nodes represent merged clusters. The root node contains all samples.

parent class-attribute instance-attribute

parent: ClusterTree | None = None

Parent node.

children class-attribute instance-attribute

children: list[ClusterTree] = field(default_factory=list)

Children nodes.

sample_indices class-attribute instance-attribute

sample_indices: list[int] = field(default_factory=list)

The cluster: dataset sample indices.