Clustering
Using the SDK to create clustering applications
This guide shows you how to create a clustering application on the AI & Analytics Engine.
You can download the dataset used in this example here: insurance-customers-segmentation.csv.
from aiaengine import Org
from aiaengine import FileSource, Column, DataType
from aiaengine import ClusteringConfig, Clusterers, HyperparameterTuningMethod
from aiaengine import get_all_clustering_results
# Start from your organisation and create a fresh project to hold the demo.
org = Org(id='b6240512-cd17-43a0-8297-84c51c1bc5a0')  # replace with your org ID
project = org.create_project(
    name="Demo project using Python SDK",
    description="Your demo project",
)
# Alternatively, reuse a project that already exists instead of creating one.
# NOTE(review): `Project` is not among the names imported at the top of this
# example, so it would need to be imported first (presumably from aiaengine
# -- confirm).
# project = Project(id='ID_of_your_project')  # replace with your own project ID
# Import the insurance customers CSV into the project as a new dataset.
# The schema lists every column, in order, together with its data type; the
# (name, type) pairs below are turned into Column objects one by one.
dataset = project.create_dataset(
    name='Insurance customers',
    data_source=FileSource(
        file_urls=['examples/datasets/insurance-customers-segmentation.csv'],
        schema=[
            Column(column_name, column_type)
            for column_name, column_type in (
                ("Customer", DataType.Text),
                ("State", DataType.Text),
                ("Response", DataType.Text),
                ("Coverage", DataType.Text),
                ("Education", DataType.Text),
                ("Effective To Date", DataType.DateTime),
                ("EmploymentStatus", DataType.Text),
                ("Gender", DataType.Text),
                ("Income", DataType.Numeric),
                ("Location Code", DataType.Text),
                ("Marital Status", DataType.Text),
                ("Monthly Premium Auto", DataType.Numeric),
                ("Months Since Last Claim", DataType.Numeric),
                ("Months Since Policy Inception", DataType.Numeric),
                ("Number of Open Complaints", DataType.Numeric),
                ("Number of Policies", DataType.Numeric),
                ("Policy Type", DataType.Text),
                ("Policy", DataType.Text),
                ("Renew Offer Type", DataType.Text),
                ("Sales Channel", DataType.Text),
                ("Total Claim Amount", DataType.Numeric),
                ("Vehicle Class", DataType.Text),
                ("Vehicle Size", DataType.Text),
            )
        ],
    ),
)
# Columns used as clustering features: every column declared in the dataset
# schema except "Customer" and "Effective To Date".
segmentation_features = [
    "State",
    "Response",
    "Coverage",
    "Education",
    "EmploymentStatus",
    "Gender",
    "Income",
    "Location Code",
    "Marital Status",
    "Monthly Premium Auto",
    "Months Since Last Claim",
    "Months Since Policy Inception",
    "Number of Open Complaints",
    "Number of Policies",
    "Policy Type",
    "Policy",
    "Renew Offer Type",
    "Sales Channel",
    "Total Claim Amount",
    "Vehicle Class",
    "Vehicle Size",
]

# First model: GMM, tuned with grid search; the 'k' hyperparameter is given
# min_num_clusters=2 and max_num_clusters=5.
gmm_model = {
    'name': 'GMM',
    'template_id': Clusterers.GMM,
    'training_config': {
        'hyperparameter_tuning_method': HyperparameterTuningMethod.GRID_SEARCH,
        'hyperparameter_tuning_config': None,
        'hyperparameters': {
            'k': {
                'max_num_clusters': 5,
                'min_num_clusters': 2,
            },
        },
    },
}

# Second model: UMAP_HDBSCAN with no tuning method and no explicit
# hyperparameters (presumably the template defaults apply -- confirm).
umap_hdbscan_model = {
    'name': 'UMAP_HDBSCAN',
    'template_id': Clusterers.UMAP_HDBSCAN,
    'training_config': {
        'hyperparameter_tuning_method': None,
        'hyperparameter_tuning_config': None,
        'hyperparameters': {},
    },
}

# Create the clustering application on the imported dataset with both models.
app = project.create_app(
    name='Insurance Customer Segmentation - Clustering',
    dataset_id=dataset.id,
    config=ClusteringConfig(
        feature_set={'selected_features': segmentation_features},
        initial_models=[gmm_model, umap_hdbscan_model],
    ),
)
# Retrieve the clustering results for the application created above.
# NOTE(review): whether this call waits for model training to finish is not
# visible here -- confirm against the SDK documentation.
clustering_results = get_all_clustering_results(app)