No tienes acceso a esta clase

¡Continúa aprendiendo! Únete y comienza a potenciar tu carrera

Evaluando resultados de hierarchical clustering

13/27
Recursos

Aportes 6

Preguntas 2

Ordenar por:

¿Quieres ver más aportes, preguntas y respuestas de la comunidad?

Lo mismo, pero con Plotly. Me guié por este código: https://chart-studio.plotly.com/~Diksha_Gabha/2853.embed, pero le solucioné errores que no permitían visualizar algunas funciones y sustituí librerías antiguas por sus equivalentes actuales.

``````import plotly.graph_objects as go
from plotly.subplots import make_subplots

# NOTE: `from __future__ import print_function` was dropped here. It is a
# no-op on Python 3 and a SyntaxError anywhere except at the top of a module.

from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

# Silhouette analysis of agglomerative (hierarchical) clustering, drawn with
# plotly: for every candidate number of clusters one figure is built with the
# per-sample silhouette plot on the left and the labelled data on the right.
#
# NOTE(review): `X` is not defined in this snippet. The code indexes it as
# X[:, 0] / X[:, 1], so it is assumed to be a 2-D NumPy feature array created
# earlier in the notebook -- confirm before running stand-alone.

range_n_clusters = [2, 3, 4, 5, 6]

figures = []

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig = make_subplots(
        rows=1,
        cols=2,
        print_grid=False,
        subplot_titles=(
            'The silhouette plot for the various clusters.',
            'The visualization of the clustered data.',
        ),
    )

    # The 1st subplot is the silhouette plot.
    # The silhouette coefficient can range from -1 to 1; the axis is limited
    # to [-0.1, 1] here.
    fig.update_xaxes(
        title_text='The silhouette coefficient values',
        range=[-0.1, 1],
        row=1,
        col=1,
    )

    # The (n_clusters + 1) * 10 is for inserting blank space between the
    # silhouette plots of individual clusters, to demarcate them clearly.
    fig.update_yaxes(
        title_text='Cluster label',
        showticklabels=False,
        range=[0, len(X) + (n_clusters + 1) * 10],
        row=1,
        col=1,
    )

    # Agglomerative clustering with Ward linkage; no random seed is passed.
    clusterer = AgglomerativeClustering(
        n_clusters=n_clusters, metric='euclidean', linkage='ward'
    )
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters.
    silhouette_avg = silhouette_score(X, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10

    # One plotly colour string per cluster, indexed by cluster label.
    color = []

    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # The colormap yields RGBA floats in [0, 1]; plotly's 'rgb(r, g, b)'
        # strings use integer channels in [0, 255], so convert explicitly
        # instead of relying on str(tuple).
        red, green, blue = (
            int(round(255 * channel))
            for channel in cm.nipy_spectral(float(i) / n_clusters)[:3]
        )
        colors = f'rgb({red}, {green}, {blue})'
        color.append(colors)

        filled_area = go.Scatter(
            y=np.arange(y_lower, y_upper),
            x=ith_cluster_silhouette_values,
            mode='lines',
            showlegend=False,
            line=dict(width=0.5, color=colors),
            fill='tozerox',
            name='Silhouette',
        )
        # Without this call the silhouette of the cluster is never drawn.
        fig.add_trace(filled_area, row=1, col=1)

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    # The vertical line for average silhouette score of all the values
    axis_line = go.Scatter(
        x=[silhouette_avg, silhouette_avg],
        y=[0, y_upper],
        showlegend=False,
        mode='lines',
        line=dict(color="red", dash='dash', width=2),
    )
    fig.add_trace(axis_line, row=1, col=1)

    # 2nd plot showing the actual clusters formed; every point takes the
    # colour of its cluster so both panels match.
    clusters = go.Scatter(
        x=X[:, 0],
        y=X[:, 1],
        showlegend=False,
        mode='markers',
        marker=dict(color=[color[label] for label in cluster_labels], size=4),
        name='Data',
    )
    fig.add_trace(clusters, row=1, col=2)

    # No cluster centres are drawn: AgglomerativeClustering does not expose a
    # `cluster_centers_` attribute the way KMeans does.

    fig.update_xaxes(
        title_text='Feature space for the 1st feature',
        zeroline=False,
        row=1,
        col=2,
    )
    fig.update_yaxes(
        title_text='Feature space for the 2nd feature',
        zeroline=False,
        row=1,
        col=2,
    )

    fig.update_layout(
        title="Silhouette analysis for agglomerative clustering on sample data "
              "with n_clusters = %d" % n_clusters,
        showlegend=True,
    )

    figures.append(fig)
    fig.show()
``````

Quizás requieras:

``````# !pip install chart_studio
``````

Esta función tal vez pueda ser de ayuda para visualizar el silhouette_score(avg) y el silhouette_samples.

def plot_silhouette(df, X, y, clust_var_name=""):
    """Plot the per-sample silhouette values of each cluster, one figure each.

    For every distinct cluster label, the silhouette values of that cluster
    are sorted and drawn as a horizontal bar chart on matplotlib figure number
    ``i`` (the position of the label in order of first appearance). Each figure
    also shows a dashed red vertical line and a text label with the overall
    average silhouette score, rounded to two decimals.

    Args:
        df: DataFrame with one row per observation. A ``"silhouette_samples"``
            column is added to it in place.
        X: Feature array passed to ``silhouette_score`` and
            ``silhouette_samples``.
        y: Cluster label predicted for each observation, in the same row
            order as ``df``.
        clust_var_name: Name of the ``df`` column holding the cluster label
            of each observation. When empty (the default), the labels in
            ``y`` are used to group the rows instead.

    Returns:
        None. The figures are left open on the current matplotlib backend.
    """
    silhouette = round(silhouette_score(X, y), 2)
    df["silhouette_samples"] = silhouette_samples(X, y)

    if clust_var_name:
        labels = df[clust_var_name]
    else:
        # The default used to fail with KeyError on df[""]; fall back to the
        # labels that were passed in, aligned positionally with df's rows.
        import pandas as pd

        labels = pd.Series(list(y), index=df.index)

    n_samples = len(df)
    # Vertical position of the score label: 5% of the way up the y-range.
    y_pos = n_samples * 0.05

    for i, cluster in enumerate(labels.unique()):
        df_aux = df[labels == cluster].sort_values("silhouette_samples")
        plt.figure(i)
        df_aux["silhouette_samples"].plot(kind="barh")
        # The line spans the full sample count, so every figure shares the
        # same y-range regardless of the size of its cluster.
        plt.vlines(x=silhouette, ymin=0, ymax=n_samples, linestyles="--", color="red")
        plt.text(x=0.8, y=y_pos, s=f"avg_silhouette_score: {silhouette}")
        plt.xlabel("silhouette_score")
        plt.ylabel("n_sample")
        plt.yticks([])
        plt.title(f"Sample silhouettes for cluster {cluster}")
``````

Excelentes explicaciones, ¡hacía falta este curso!

¡Buena clase! Sigo este curso con un dataset de transacciones con y sin fraude. Comparto en la imagen la gráfica de mi índice de silhouette para 2 clusters. Aún estoy buscándole una explicación, porque de los 400 puntos o transacciones solo 2 corresponden al cluster 1 y 398 al cluster 2; de ahí el ancho tan delgado y tan grueso, respectivamente. Entiendo que aún debo seguir trabajando en las variables de mi dataset para tener mejores resultados. ¡Todo feedback es bienvenido! Gracias de antemano.

Uff, me encantó esta clase, más que todo la forma tan clara de ver los K, con los gráficos de velas y su respectiva clasificación al lado. 💪💪💪💪

``````from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Silhouette analysis of agglomerative (hierarchical) clustering with
# matplotlib: one figure per candidate number of clusters, with the
# per-sample silhouette plot on the left and the labelled data on the right.
#
# NOTE(review): `X` is not defined in this snippet. The code indexes it as
# X[:, 0] / X[:, 1], so it is assumed to be a 2-D NumPy feature array created
# earlier in the notebook -- confirm before running stand-alone.

range_n_clusters = [2, 3, 4, 5]

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot.
    # The silhouette coefficient can range from -1 to 1; the axis is limited
    # to [-0.1, 1] here.
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters + 1) * 10 is for inserting blank space between the
    # silhouette plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Agglomerative clustering with Ward linkage; no random seed is passed.
    # The distance argument is left at its default (Euclidean) on purpose:
    # its keyword was renamed from `affinity` to `metric`, and `affinity` no
    # longer exists in recent scikit-learn releases, so omitting it is the
    # only spelling that works on every version.
    clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters.
    silhouette_avg = silhouette_score(X, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd plot showing the actual clusters formed, coloured with the same
    # colormap positions as the silhouettes on the left.
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(
        X[:, 0], X[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
    )

plt.show()
``````