Skip to content

DIANA: Bisecting Kmeans

toyml.clustering.bisect_kmeans.BisectingKmeans dataclass

BisectingKmeans(
    k: int,
    cluster_tree: ClusterTree = ClusterTree(),
    labels: list[int] = list(),
)

Bisecting K-means algorithm. Belong to Divisive hierarchical clustering (DIANA) algorithm.(top-down)

Examples:

>>> from toyml.clustering import BisectingKmeans
>>> dataset = [[1.0, 1.0], [1.0, 2.0], [2.0, 1.0], [10.0, 1.0], [10.0, 2.0], [11.0, 1.0]]
>>> bisect_kmeans = BisectingKmeans(k=3)
>>> labels = bisect_kmeans.fit_predict(dataset)
>>> clusters = bisect_kmeans.cluster_tree.get_clusters()
References
  1. Harrington
  2. Tan
See Also

k instance-attribute

k: int

The number of clusters, specified by user.

cluster_tree class-attribute instance-attribute

cluster_tree: ClusterTree = field(
    default_factory=ClusterTree
)

The cluster tree

labels class-attribute instance-attribute

labels: list[int] = field(default_factory=list)

The cluster labels of the dataset.

fit

fit(dataset: list[list[float]]) -> 'BisectingKmeans'

Fit the dataset with Bisecting K-means algorithm.

PARAMETER DESCRIPTION
dataset

The set of data points for clustering.

TYPE: list[list[float]]

RETURNS DESCRIPTION
'BisectingKmeans'

self.

Source code in toyml/clustering/bisect_kmeans.py
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
def fit(self, dataset: list[list[float]]) -> "BisectingKmeans":
    """
    Fit the dataset with Bisecting K-means algorithm.

    Args:
        dataset: The set of data points for clustering.

    Returns:
        self.

    """
    n = len(dataset)
    # check dataset
    if self.k > n:
        raise ValueError(
            f"Number of clusters(k) cannot be greater than the number of samples(n), not get {self.k=} > {n=}"
        )
    # start with only one cluster which contains all the data points in dataset
    cluster = list(range(n))
    self.cluster_tree.cluster = cluster
    self.cluster_tree.centroid = self._get_cluster_centroids(dataset, cluster)
    self.labels = self._get_dataset_labels(dataset)
    total_error = sum_square_error([dataset[i] for i in cluster])
    # iterate until got k clusters
    while len(self.cluster_tree.get_clusters()) < self.k:
        # init values for later iteration
        to_splot_cluster_node = None
        split_cluster_into: Optional[tuple[list[int], list[int]]] = None
        for cluster_index, cluster_node in enumerate(self.cluster_tree.leaf_cluster_nodes()):
            # perform K-means with k=2
            cluster_data = [dataset[i] for i in cluster_node.cluster]
            # If the cluster cannot be split further, skip it
            if len(cluster_data) < 2:
                continue
            # Bisect by kmeans with k=2
            cluster_unsplit_error, cluster_split_error, (cluster1, cluster2) = self._bisect_by_kmeans(
                cluster_data, cluster_node, dataset
            )
            new_total_error = total_error - cluster_unsplit_error + cluster_split_error
            if new_total_error < total_error:
                total_error = new_total_error
                split_cluster_into = (cluster1, cluster2)
                to_splot_cluster_node = cluster_node

        if to_splot_cluster_node is not None and split_cluster_into is not None:
            self._commit_split(to_splot_cluster_node, split_cluster_into, dataset)
            self.labels = self._get_dataset_labels(dataset)
        else:
            break

    return self

fit_predict

fit_predict(dataset: list[list[float]]) -> list[int]

Fit and predict the cluster label of the dataset.

PARAMETER DESCRIPTION
dataset

The set of data points for clustering.

TYPE: list[list[float]]

RETURNS DESCRIPTION
list[int]

Cluster labels of the dataset samples.

Source code in toyml/clustering/bisect_kmeans.py
210
211
212
213
214
215
216
217
218
219
220
def fit_predict(self, dataset: list[list[float]]) -> list[int]:
    """
    Fit and predict the cluster label of the dataset.

    Args:
        dataset: The set of data points for clustering.

    Returns:
        Cluster labels of the dataset samples.
    """
    return self.fit(dataset).labels

predict

predict(points: list[list[float]]) -> list[int]

Predict the cluster label of the given points.

PARAMETER DESCRIPTION
points

A list of data points to predict.

TYPE: list[list[float]]

RETURNS DESCRIPTION
list[int]

A list of predicted cluster labels for the input points.

RAISES DESCRIPTION
ValueError

If the model has not been fitted yet.

Source code in toyml/clustering/bisect_kmeans.py
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
def predict(self, points: list[list[float]]) -> list[int]:
    """
    Predict the cluster label of the given points.

    Args:
        points: A list of data points to predict.

    Returns:
        A list of predicted cluster labels for the input points.

    Raises:
        ValueError: If the model has not been fitted yet.
    """
    if self.cluster_tree.centroid is None:
        raise ValueError("The model has not been fitted yet.")

    clusters = self.cluster_tree.get_clusters()
    predictions = []
    for point in points:
        node = self.cluster_tree
        while not node.is_leaf():
            if node.left is None or node.right is None:
                raise ValueError("Invalid cluster tree structure.")

            dist_left = euclidean_distance(point, node.left.centroid)  # type: ignore[arg-type]
            dist_right = euclidean_distance(point, node.right.centroid)  # type: ignore[arg-type]

            node = node.left if dist_left < dist_right else node.right
        cluster_index = clusters.index(node.cluster)
        predictions.append(cluster_index)

    return predictions