-
Notifications
You must be signed in to change notification settings - Fork 2
/
dbscan-code-5
42 lines (36 loc) · 1.53 KB
/
dbscan-code-5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# Estimate a DBSCAN epsilon for several neighbourhood sizes k by locating the
# elbow of the sorted mean k-nearest-neighbour distance curve.

# Clustering features; missing values are replaced by the column mean.
X_train = df[['AGE', 'BMI']]
X_train = X_train.fillna(X_train.mean())

k_values = range(4, 11)
optimal_epsilons = []  # (k, epsilon) pairs, one per k with a detectable elbow

plt.figure(figsize=(15, 10))
for k in k_values:
    neighbors = NearestNeighbors(n_neighbors=k)
    neighbors_fit = neighbors.fit(X_train)
    # NOTE(review): querying with the same data the model was fitted on makes
    # every point its own nearest neighbour (distance 0), which lowers the
    # mean below. Confirm this is intended before changing it, because the
    # epsilon values reported at the end depend on it.
    distances, _ = neighbors_fit.kneighbors(X_train)

    # Per-point mean distance to its k neighbours, sorted ascending so the
    # curve is monotone and an elbow can be located.
    mean_distances = np.sort(distances.mean(axis=1))
    point_ranks = range(len(mean_distances))

    # Draw the curve first so it is shown even when no elbow is detected.
    plt.plot(point_ranks, mean_distances, label=f'k={k}')

    kneedle = KneeLocator(point_ranks, mean_distances, S=1.0, curve='convex', direction='increasing')
    # Despite its name, optimal_k is the position of the elbow on the sorted
    # curve (an index into mean_distances), not a neighbourhood size.
    optimal_k = kneedle.elbow
    if optimal_k is None:
        # Without this guard, mean_distances[None] would silently yield a 2-D
        # array instead of a scalar epsilon, and axvline(x=None) is invalid.
        print(f'No elbow found for k={k}; skipping')
        continue

    optimal_epsilon = mean_distances[optimal_k]
    optimal_epsilons.append((k, optimal_epsilon))
    plt.axvline(x=optimal_k, color='r', linestyle='--')

plt.xlabel('Points sorted by distance')
plt.ylabel('Mean Distance to k Nearest Neighbors')
plt.title('Elbow Plot for Different k Values')
plt.legend()
plt.show()

for k, epsilon in optimal_epsilons:
    print(f'Optimal epsilon for k={k}: {epsilon}')
# Cluster the AGE/BMI features with DBSCAN, then plot the clusters and the
# noise points separately.

# NOTE(review): eps looks like a value copied from an earlier elbow-analysis
# run (presumably the k=4 result, matching min_samples=4) — confirm, and
# re-derive it if the data changes.
dbscan = DBSCAN(eps=3.2012313926500067, min_samples=4)
dbscan.fit(X_train)
df['cluster'] = dbscan.labels_
print(df['cluster'].value_counts())
# Number of labelled points, read from the fitted model so the script does
# not depend on a variable defined elsewhere.
print(len(dbscan.labels_))

# DBSCAN assigns the label -1 to points it treats as noise.
is_noise = df['cluster'] == -1
outliers = df[is_noise]
clusters = df[~is_noise]

plt.figure(figsize=(8, 6))
sns.scatterplot(x='AGE', y='BMI', data=clusters, hue='cluster', palette='Dark2', s=300)
# Noise is drawn on top as small black dots so it stays distinguishable from
# the large cluster markers.
plt.scatter(outliers['AGE'], outliers['BMI'], color='black', s=10, label='noise', marker='o')
# Anchor the legend outside the axes so it does not cover the data.
plt.legend(loc='upper right', bbox_to_anchor=(1.17, 1), title='Clusters')
plt.show()