Skip to content

Kmeans

toyml.clustering.kmeans.Kmeans dataclass

Kmeans(k: int, max_iter: int = 500, tol: float = 1e-05, centroids_init_method: Literal['random', 'kmeans++'] = 'random', random_seed: int | None = None, distance_metric: Literal['euclidean'] = 'euclidean', iter_: int = 0, clusters_: dict[int, list[int]] = dict(), centroids_: dict[int, list[float]] = dict(), labels_: list[int] = list())

K-means algorithm (with Kmeans++ initialization as option).

Examples:

>>> from toyml.clustering import Kmeans
>>> dataset = [[1.0, 2.0], [1.0, 4.0], [1.0, 0.0], [10.0, 2.0], [10.0, 4.0], [11.0, 0.0]]
>>> kmeans = Kmeans(k=2, random_seed=42).fit(dataset)
>>> kmeans.clusters_
{0: [3, 4, 5], 1: [0, 1, 2]}
>>> kmeans.centroids_
{0: [10.333333333333334, 2.0], 1: [1.0, 2.0]}
>>> kmeans.labels_
[1, 1, 1, 0, 0, 0]
>>> kmeans.predict([0, 1])
1
>>> kmeans.iter_
2

There is a fit_predict method that can be used to fit and predict.

Examples:

>>> from toyml.clustering import Kmeans
>>> dataset = [[1, 0], [1, 1], [1, 2], [10, 0], [10, 1], [10, 2]]
>>> Kmeans(k=2, random_seed=42).fit_predict(dataset)
[1, 1, 1, 0, 0, 0]
References
  1. Zhou Zhihua
  2. Murphy
Note

Here we just implement the naive K-means algorithm.

See Also

k instance-attribute

k: int

The number of clusters, specified by user.

max_iter class-attribute instance-attribute

max_iter: int = 500

The number of iterations the algorithm will run for if it does not converge before that.

tol class-attribute instance-attribute

tol: float = 1e-05

The tolerance for convergence.

centroids_init_method class-attribute instance-attribute

centroids_init_method: Literal['random', 'kmeans++'] = 'random'

The method to initialize the centroids.

random_seed class-attribute instance-attribute

random_seed: int | None = None

The random seed used to initialize the centroids.

distance_metric class-attribute instance-attribute

distance_metric: Literal['euclidean'] = 'euclidean'

The distance metric to use.(For now we only support euclidean).

clusters_ class-attribute instance-attribute

clusters_: dict[int, list[int]] = field(default_factory=dict)

The clusters of the dataset.

centroids_ class-attribute instance-attribute

centroids_: dict[int, list[float]] = field(default_factory=dict)

The centroids of the clusters.

labels_ class-attribute instance-attribute

labels_: list[int] = field(default_factory=list)

The cluster labels of the dataset.

fit

fit(dataset: list[list[float]]) -> Kmeans

Fit the dataset with K-means algorithm.

PARAMETER DESCRIPTION
dataset

the set of data points for clustering

TYPE: list[list[float]]

RETURNS DESCRIPTION
Kmeans

self.

Source code in toyml/clustering/kmeans.py
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def fit(self, dataset: list[list[float]]) -> Kmeans:
    """Fit the dataset with K-means algorithm.

    Args:
        dataset: the set of data points for clustering

    Returns:
        self.
    """
    self.centroids_ = self._get_initial_centroids(dataset)
    for _ in range(self.max_iter):
        self.iter_ += 1
        prev_centroids = self.centroids_
        self._iter_step(dataset)
        if self._is_converged(prev_centroids):
            break
    return self

_iter_step

_iter_step(dataset: list[list[float]]) -> None

Can be used to control the fitting process step by step.

Source code in toyml/clustering/kmeans.py
91
92
93
94
95
def _iter_step(self, dataset: list[list[float]]) -> None:
    """Can be used to control the fitting process step by step."""
    self.clusters_ = self._get_clusters(dataset)
    self.centroids_ = self._get_centroids(dataset)
    self.labels_ = self._predict_dataset_labels(dataset)

fit_predict

fit_predict(dataset: list[list[float]]) -> list[int]

Fit and predict the cluster label of the dataset.

PARAMETER DESCRIPTION
dataset

the set of data points for clustering

TYPE: list[list[float]]

RETURNS DESCRIPTION
list[int]

Cluster labels of the dataset samples.

Source code in toyml/clustering/kmeans.py
 97
 98
 99
100
101
102
103
104
105
106
def fit_predict(self, dataset: list[list[float]]) -> list[int]:
    """Fit and predict the cluster label of the dataset.

    Args:
        dataset: the set of data points for clustering

    Returns:
        Cluster labels of the dataset samples.
    """
    return self.fit(dataset).labels_

predict

predict(point: list[float]) -> int

Predict the label of the point.

PARAMETER DESCRIPTION
point

The data point to predict.

TYPE: list[float]

RETURNS DESCRIPTION
int

The label of the point.

Source code in toyml/clustering/kmeans.py
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def predict(self, point: list[float]) -> int:
    """Predict the label of the point.

    Args:
        point: The data point to predict.

    Returns:
        The label of the point.

    """
    if len(self.centroids_) == 0:
        msg = "The model is not fitted yet"
        raise ValueError(msg)
    return self._get_point_centroid_label(point, self.centroids_)

_get_initial_centroids

_get_initial_centroids(dataset: list[list[float]]) -> dict[int, list[float]]

Get initial centroids by a simple random selection.

Source code in toyml/clustering/kmeans.py
137
138
139
140
141
142
143
144
def _get_initial_centroids(self, dataset: list[list[float]]) -> dict[int, list[float]]:
    """Get initial centroids by a simple random selection."""
    if self.centroids_init_method == "random":
        return self._get_initial_centroids_random(dataset)
    if self.centroids_init_method == "kmeans++":
        return self._get_initial_centroids_kmeans_plus(dataset)
    msg = f"Invalid centroids initialization method: {self.centroids_init_method}"
    raise ValueError(msg)

_is_converged

_is_converged(prev_centroids: dict[int, list[float]]) -> bool

Check if the centroids converged.

PARAMETER DESCRIPTION
prev_centroids

previous centroids

TYPE: dict[int, list[float]]

RETURNS DESCRIPTION
bool

Whether the centroids converged.

Source code in toyml/clustering/kmeans.py
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def _is_converged(self, prev_centroids: dict[int, list[float]]) -> bool:
    """Check if the centroids converged.

    Args:
        prev_centroids: previous centroids

    Returns:
        Whether the centroids converged.
    """
    # check every centroid
    for i, centroid in self.centroids_.items():
        prev_centroid = prev_centroids[i]
        # check every dimension
        for j in range(len(prev_centroid)):
            if abs(prev_centroid[j] - centroid[j]) > self.tol:
                return False
    return True

_get_initial_centroids_random

_get_initial_centroids_random(dataset: list[list[float]]) -> dict[int, list[float]]

Get initial centroids by a simple random selection.

PARAMETER DESCRIPTION
dataset

The dataset for clustering

TYPE: list[list[float]]

RETURNS DESCRIPTION
dict[int, list[float]]

The initial centroids

Source code in toyml/clustering/kmeans.py
164
165
166
167
168
169
170
171
172
173
174
175
def _get_initial_centroids_random(self, dataset: list[list[float]]) -> dict[int, list[float]]:
    """Get initial centroids by a simple random selection.

    Args:
        dataset: The dataset for clustering

    Returns:
        The initial centroids
    """
    data_points = random.sample(dataset, self.k)
    centroids = dict(enumerate(data_points))
    return centroids

_get_initial_centroids_kmeans_plus

_get_initial_centroids_kmeans_plus(dataset: list[list[float]]) -> dict[int, list[float]]

Get initial centroids by k-means++ algorithm.

PARAMETER DESCRIPTION
dataset

The dataset for clustering

TYPE: list[list[float]]

RETURNS DESCRIPTION
dict[int, list[float]]

The initial centroids

Source code in toyml/clustering/kmeans.py
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
def _get_initial_centroids_kmeans_plus(self, dataset: list[list[float]]) -> dict[int, list[float]]:
    """Get initial centroids by k-means++ algorithm.

    Args:
        dataset: The dataset for clustering

    Returns:
        The initial centroids
    """
    self.centroids_ = {}
    self.centroids_[0] = random.choice(dataset)
    for i in range(1, self.k):
        min_distances = [self._get_min_square_distance(point) for point in dataset]
        total_dist = sum(min_distances)
        weights = [dist / total_dist for dist in min_distances]
        self.centroids_[i] = random.choices(dataset, weights)[0]
    return self.centroids_

_get_min_square_distance

_get_min_square_distance(point: list[float]) -> float

Get the minimum square distance from the point to current centroids.

PARAMETER DESCRIPTION
point

The point to calculate the distance.

TYPE: list[float]

RETURNS DESCRIPTION
float

The minimum square distance

Source code in toyml/clustering/kmeans.py
195
196
197
198
199
200
201
202
203
204
205
206
207
def _get_min_square_distance(self, point: list[float]) -> float:
    """Get the minimum square distance from the point to current centroids.

    Args:
        point: The point to calculate the distance.

    Returns:
        The minimum square distance
    """
    return min(
        (self._calc_distance(point, centroid) ** 2 for centroid in self.centroids_.values() if centroid),
        default=math.inf,
    )

_get_point_centroid_label

_get_point_centroid_label(point: list[float], centroids: dict[int, list[float]]) -> int

Get the label of the centroid, which is closest to the point.

Source code in toyml/clustering/kmeans.py
209
210
211
212
def _get_point_centroid_label(self, point: list[float], centroids: dict[int, list[float]]) -> int:
    """Get the label of the centroid, which is closest to the point."""
    distances = [(i, self._calc_distance(point, centroid)) for i, centroid in centroids.items()]
    return min(distances, key=lambda x: x[1])[0]