Skip to content

DBSCAN

toyml.clustering.dbscan.DBSCAN dataclass

DBSCAN(
    eps: float = 0.5,
    min_samples: int = 5,
    clusters_: list[list[int]] = list(),
    core_objects_: set[int] = set(),
    noises_: list[int] = list(),
    labels_: list[int] = list(),
)

DBSCAN algorithm

Examples:

>>> from toyml.clustering import DBSCAN
>>> dataset = [[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]]
>>> dbscan = DBSCAN(eps=3, min_samples=2).fit(dataset)
>>> dbscan.clusters_
[[0, 1, 2], [3, 4]]
>>> dbscan.noises_
[5]
>>> dbscan.labels_
[0, 0, 0, 1, 1, -1]
References
  1. Zhou Zhihua
  2. Han
  3. Kassambara
  4. Wikipedia

eps class-attribute instance-attribute

eps: float = 0.5

The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. (same as sklearn)

min_samples class-attribute instance-attribute

min_samples: int = 5

The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. If min_samples is set to a higher value, DBSCAN will find denser clusters, whereas if it is set to a lower value, the found clusters will be more sparse. (same as sklearn)

clusters_ class-attribute instance-attribute

clusters_: list[list[int]] = field(default_factory=list)

The clusters found by the DBSCAN algorithm.

core_objects_ class-attribute instance-attribute

core_objects_: set[int] = field(default_factory=set)

The core objects found by the DBSCAN algorithm.

noises_ class-attribute instance-attribute

noises_: list[int] = field(default_factory=list)

The noises found by the DBSCAN algorithm.

labels_ class-attribute instance-attribute

labels_: list[int] = field(default_factory=list)

The cluster labels found by the DBSCAN algorithm.

fit

fit(data: list[list[float]]) -> 'DBSCAN'

Fit the DBSCAN model.

PARAMETER DESCRIPTION
data

The dataset.

TYPE: list[list[float]]

RETURNS DESCRIPTION
self

The fitted DBSCAN model.

TYPE: 'DBSCAN'

Source code in toyml/clustering/dbscan.py
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def fit(self, data: list[list[float]]) -> "DBSCAN":
    """
    Fit the DBSCAN model.

    Args:
        data: The dataset.

    Returns:
        self: The fitted DBSCAN model.
    """
    dataset = Dataset(data)

    # initialize the unvisited set
    unvisited = set(range(dataset.n))
    # get core objects
    self.core_objects_, self.noises_ = dataset.get_core_objects(self.eps, self.min_samples)

    # core objects used for training
    if len(self.core_objects_) == 0:
        logger.warning("No core objects found, all data points are noise. Try to adjust the hyperparameters.")
        return self

    # set of core objects: unordered
    core_object_set = self.core_objects_.copy()
    while core_object_set:
        unvisited_old = unvisited.copy()
        core_object = core_object_set.pop()
        queue: deque[int] = deque()
        queue.append(core_object)
        unvisited.remove(core_object)

        while queue:
            q = queue.popleft()
            neighbors = dataset.get_neighbors(q, self.eps)
            if len(neighbors) + 1 >= self.min_samples:
                delta = set(neighbors) & unvisited
                for point in delta:
                    queue.append(point)
                    unvisited.remove(point)

        cluster = unvisited_old - unvisited
        self.clusters_.append(list(cluster))
        core_object_set -= cluster

    self.labels_ = [-1] * dataset.n  # -1 means noise
    for i, cluster_index in enumerate(self.clusters_):
        for j in cluster_index:
            self.labels_[j] = i
    return self

fit_predict

fit_predict(data: list[list[float]]) -> list[int]

Fit the DBSCAN model and return the cluster labels.

PARAMETER DESCRIPTION
data

The dataset.

TYPE: list[list[float]]

RETURNS DESCRIPTION
list[int]

The cluster labels.

Source code in toyml/clustering/dbscan.py
107
108
109
110
111
112
113
114
115
116
117
def fit_predict(self, data: list[list[float]]) -> list[int]:
    """
    Fit the DBSCAN model and return the cluster labels.

    Args:
        data: The dataset.

    Returns:
        The cluster labels.
    """
    return self.fit(data).labels_

toyml.clustering.dbscan.Dataset dataclass

Dataset(
    data: list[list[float]],
    distance_metric: Literal["euclidean"] = "euclidean",
)

Dataset for DBSCAN

PARAMETER DESCRIPTION
data

The dataset.

TYPE: list[list[float]]

ATTRIBUTE DESCRIPTION
data

The dataset.

TYPE: list[list[float]]

n

The number of data points.

TYPE: int

distance_matrix_

The distance matrix.

TYPE: list[list[float]]

distance_metric class-attribute instance-attribute

distance_metric: Literal['euclidean'] = 'euclidean'

The distance metric to use.(For now we only support euclidean).

get_neighbors

get_neighbors(i: int, eps: float) -> list[int]

Get the neighbors of the i-th data point.

PARAMETER DESCRIPTION
i

The index of the data point.

TYPE: int

eps

The maximum distance between two samples for one to be considered as in the neighborhood of the other.

TYPE: float

RETURNS DESCRIPTION
list[int]

The indices of the neighbors (Don't include the point itself).

Source code in toyml/clustering/dbscan.py
152
153
154
155
156
157
158
159
160
161
162
163
def get_neighbors(self, i: int, eps: float) -> list[int]:
    """
    Get the neighbors of the i-th data point.

    Args:
        i: The index of the data point.
        eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other.

    Returns:
        The indices of the neighbors (Don't include the point itself).
    """
    return [j for j in range(self.n) if i != j and self.distance_matrix_[i][j] <= eps]

get_core_objects

get_core_objects(
    eps: float, min_samples: int
) -> tuple[set[int], list[int]]

Get the core objects and noises of the dataset.

PARAMETER DESCRIPTION
eps

The maximum distance between two samples for one to be considered as in the neighborhood of the other.

TYPE: float

min_samples

The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.

TYPE: int

RETURNS DESCRIPTION
core_objects

The indices of the core objects.

TYPE: set[int]

noises

The indices of the noises.

TYPE: list[int]

Source code in toyml/clustering/dbscan.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
def get_core_objects(self, eps: float, min_samples: int) -> tuple[set[int], list[int]]:
    """
    Get the core objects and noises of the dataset.

    Args:
        eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other.
        min_samples: The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.

    Returns:
        core_objects: The indices of the core objects.
        noises: The indices of the noises.
    """
    core_objects = set()
    noises = []
    for i in range(self.n):
        neighbors = self.get_neighbors(i, eps)
        if len(neighbors) + 1 >= min_samples:  # +1 to include the point itself
            core_objects.add(i)
        else:
            noises.append(i)
    return core_objects, noises