No tienes acceso a esta clase

¡Continúa aprendiendo! Únete y comienza a potenciar tu carrera

Evaluando resultados de hierarchical clustering

13/27
Recursos

Aportes 8

Preguntas 2

Ordenar por:

¿Quieres ver más aportes, preguntas y respuestas de la comunidad?

Lo mismo pero con plotly. Me guié de este código https://chart-studio.plotly.com/~Diksha_Gabha/2853.embed, pero le solucioné errores que tenía y que no permitían visualizar algunas funciones, y sustituí librerías antiguas por sus equivalentes actuales.

from __future__ import print_function

import matplotlib
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score

# Silhouette analysis of agglomerative (hierarchical) clustering, drawn with
# plotly.  One figure is built per candidate number of clusters: the left
# panel shows the sorted per-sample silhouette values of every cluster, the
# right panel shows the samples coloured by their assigned cluster.
# NOTE(review): X is not defined in this snippet; it is assumed to be the 2-D
# feature array built earlier in the lesson -- confirm before running.
range_n_clusters = [2, 3, 4, 5, 6]

# Every figure is kept so it can be re-displayed or exported after the loop.
figures = []

for n_clusters in range_n_clusters:
    # One row, two columns: silhouette plot on the left, scatter on the right.
    fig = make_subplots(
        rows=1,
        cols=2,
        print_grid=False,
        subplot_titles=(
            'The silhouette plot for the various clusters.',
            'The visualization of the clustered data.',
        ),
    )

    # The silhouette coefficient can range from -1 to 1, but the plot is
    # restricted to [-0.1, 1].
    fig['layout']['xaxis1'].update(title='The silhouette coefficient values',
                                   range=[-0.1, 1])

    # The (n_clusters + 1) * 10 leaves blank space between the silhouette
    # plots of the individual clusters, to demarcate them clearly.
    fig['layout']['yaxis1'].update(title='Cluster label',
                                   showticklabels=False,
                                   range=[0, len(X) + (n_clusters + 1) * 10])

    # Ward linkage only supports the Euclidean metric, which is also the
    # default, so the metric is not passed explicitly; this keeps the call
    # valid on scikit-learn releases that spell the keyword `affinity` as
    # well as on those that spell it `metric`.
    clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value over all the samples, a
    # summary of the density and separation of the formed clusters.
    silhouette_avg = silhouette_score(X, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )

    # Silhouette value of every individual sample.
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10

    # One plotly colour per cluster, reused as the colour scale of the
    # scatter plot so both panels agree.
    cluster_colors = []

    for i in range(n_clusters):
        # Silhouette values of the samples in cluster i, in ascending order.
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == i])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # matplotlib colormaps return channels as floats in [0, 1], whereas
        # plotly's 'rgb(r, g, b)' strings use the 0-255 scale; without the
        # rescaling every cluster is drawn almost black.
        red, green, blue = (
            int(round(255 * channel))
            for channel in cm.nipy_spectral(float(i) / n_clusters)[:3]
        )
        cluster_color = 'rgb({}, {}, {})'.format(red, green, blue)
        cluster_colors.append(cluster_color)

        filled_area = go.Scatter(y=np.arange(y_lower, y_upper),
                                 x=ith_cluster_silhouette_values,
                                 mode='lines',
                                 showlegend=False,
                                 line=dict(width=0.5,
                                           color=cluster_color),
                                 fill='tozerox',
                                 name='Silhouette')
        fig.add_trace(filled_area, row=1, col=1)

        # Start of the next cluster's band, leaving a gap of 10.
        y_lower = y_upper + 10

    # Vertical dashed line at the average silhouette score of all samples.
    axis_line = go.Scatter(x=[silhouette_avg, silhouette_avg],
                           y=[0, y_upper],
                           showlegend=False,
                           mode='lines',
                           line=dict(color="red", dash='dash',
                                     width=2))
    fig.add_trace(axis_line, row=1, col=1)

    # 2nd plot showing the actual clusters formed.  The integer labels are
    # mapped onto the per-cluster colours through the colour scale.
    clusters = go.Scatter(x=X[:, 0],
                          y=X[:, 1],
                          showlegend=False,
                          mode='markers',
                          marker=dict(color=cluster_labels,
                                      size=4, colorscale=cluster_colors),
                          name='Data')
    fig.add_trace(clusters, row=1, col=2)

    # No cluster-centre markers are drawn: AgglomerativeClustering does not
    # expose a `cluster_centers_` attribute the way KMeans does.

    fig['layout']['xaxis2'].update(title='Feature space for the 1st feature',
                                   zeroline=False)
    fig['layout']['yaxis2'].update(title='Feature space for the 2nd feature',
                                   zeroline=False)

    fig['layout'].update(
        title="Silhouette analysis for agglomerative clustering on sample data "
              "with n_clusters = %d" % n_clusters)

    fig.update_layout(showlegend=True)
    figures.append(fig)
    fig.show()

Quizás requieras:

# !pip install chart_studio

Esta función tal vez pueda ser de ayuda para visualizar el silhouette_score(avg) y el silhouette_samples.

def plot_silhouette(df,X,y, clust_var_name=""):
    """Plot the per-sample silhouette values of each cluster as bar charts.

    One matplotlib figure is opened per cluster (numbered 0, 1, ... in order
    of first appearance of the cluster label). Inside each figure the
    cluster's samples are drawn as horizontal bars sorted by silhouette
    value, with a dashed red line at the overall average silhouette score.

    Side effect: a ``silhouette_samples`` column is added to (or overwritten
    in) ``df``.

    Args:
        df: DataFrame with one row per observation, in the same order as
            ``X`` and ``y``.
        X: array with the features, passed to ``silhouette_score`` and
            ``silhouette_samples``.
        y: array with the cluster predicted for each observation.
        clust_var_name: name of the ``df`` column holding the cluster
            assigned to each observation. When empty (the default), the
            labels in ``y`` are used to group the rows instead.
    """
    silhouette = round(silhouette_score(X, y), 2)
    df["silhouette_samples"] = silhouette_samples(X, y)

    if clust_var_name:
        labels = df[clust_var_name]
        clusters = labels.unique()
    else:
        # No label column was named: fall back to the predictions themselves
        # rather than failing on df[""]. dict.fromkeys keeps the labels in
        # order of first appearance, like Series.unique does.
        labels = np.asarray(y)
        clusters = list(dict.fromkeys(labels.tolist()))

    for i, cluster in enumerate(clusters):
        df_aux = df[labels == cluster].sort_values("silhouette_samples")
        cluster_size = len(df_aux)
        plt.figure(i)
        df_aux["silhouette_samples"].plot(kind="barh")
        # axvline spans the full height of the axes without extending the
        # y-limits, so the bars of this cluster fill the figure instead of
        # being squeezed below a line sized for the whole dataset.
        plt.axvline(x=silhouette, linestyle="--", color="red")
        # Place the annotation 5% of the way up this cluster's bars.
        plt.text(x=0.8, y=cluster_size * 0.05,
                 s=f"avg_silhouette_score: {silhouette}")
        plt.xlabel("silhouette_score")
        plt.ylabel("n_sample")
        plt.yticks([])
        plt.title(f"Sample silhouettes for cluster {cluster}")

Excelentes explicaciones, ¡hacía falta este curso!

Uff, me encantó esta clase, más que todo la forma tan clara de ver los K, con los gráficos de velas y su respectiva clasificación al lado.💪💪💪💪

¡Buena clase! Sigo este curso con un dataset de transacciones con y sin fraude. Comparto en la imagen la gráfica de mi índice de silhouette para 2 clusters; aún estoy buscándole una explicación, porque de los 400 puntos o transacciones solo 2 corresponden al cluster 1 y 398 al cluster 2, y por ello el ancho tan delgado y tan grueso, respectivamente. Entendería que aún debo seguir trabajando en las variables de mi dataset para tener mejores resultados. ¡Todo feedback es bienvenido! Gracias de antemano.

Son dos Algoritmos de gran relevancia en el Machine learning aprendizaje no supervisado, fáciles de comprender y usar acorde a nuestro dataframe, amplia la visión de dendrograma a conjunto de ven, un buen contraste para así comprender El proceso.
Me encantaron los dos, aunque creo que hierarchical clustering es más eficiente para averiguar cuántos tipos diferentes de grupos hay. En cambio, K-means es más eficiente cuando ya tenemos la cantidad de centroides.
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm


# Silhouette analysis of agglomerative (hierarchical) clustering, drawn with
# matplotlib.  One figure is built per candidate number of clusters: the left
# axes show the sorted per-sample silhouette values of every cluster, the
# right axes show the samples coloured by their assigned cluster.
# NOTE(review): X is not defined in this snippet; it is assumed to be the 2-D
# feature array built earlier in the lesson -- confirm before running.
range_n_clusters = [2, 3, 4, 5]

for n_clusters in range_n_clusters:
    # One row, two columns: silhouette plot on the left, scatter on the right.
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The silhouette coefficient can range from -1 to 1, but the plot is
    # restricted to [-0.1, 1].
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters + 1) * 10 leaves blank space between the silhouette
    # plots of the individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Ward linkage only supports the Euclidean metric, which is also the
    # default, so it is not passed explicitly: the `affinity` keyword was
    # removed from scikit-learn and its replacement `metric` does not exist
    # in older releases.
    clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value over all the samples, a
    # summary of the density and separation of the formed clusters.
    silhouette_avg = silhouette_score(X, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )

    # Silhouette value of every individual sample.
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Silhouette values of the samples in cluster i, in ascending order.
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == i]
        )

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the silhouette band with its cluster number at the middle.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Start of the next cluster's band, leaving a gap of 10.
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Vertical dashed line at the average silhouette score of all samples.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the y-axis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd plot showing the actual clusters formed, using the same colormap
    # positions as the silhouette bands.
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(
        X[:, 0], X[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
    )
    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    fig.suptitle(
        "Silhouette analysis for agglomerative clustering on sample data "
        "with n_clusters = %d" % n_clusters
    )


# Display every figure created above.
plt.show()