Skip to content

Dataset

This section shows how to create, retrieve, update, list, and delete datasets in the AI & Analytics Engine using the SDK.

Creating a dataset

Here we show an example of creating a dataset by uploading a CSV file from the local file system.

from aiaengine import Org, Project, FileSource, Column, SemanticType

# Create a new demo project in the org.
org = Org(id='b6240512-cd17-43a0-8297-84c51c1bc5a0')  # replace with your org ID
project = org.create_project(name="Demo project using Python SDK", description="Your demo project")
# Alternatively, get an existing project that you want to work on:
# project = Project(id='ID_of_your_project')  # replace with your own project ID

# Import the `German Credit Data` dataset from a local CSV file.
data_file = 'examples/datasets/german-credit.csv'
# You can use the `print_schema` utility function to print the auto-inferred schema.
# NOTE: `print_schema` and pandas (`pd`) are not imported by this snippet; import
# them first if you uncomment the line below.
# print_schema(pd.read_csv(data_file, header=0))

# The schema lists one Column per CSV column together with its semantic type.
dataset = project.create_dataset(
    name="German Credit Data",
    data_source=FileSource(
        file_urls=[data_file],
        schema=[
            Column('checking_status', SemanticType.Text),
            Column('duration', SemanticType.Numeric),
            Column('credit_history', SemanticType.Text),
            Column('purpose', SemanticType.Text),
            Column('credit_amount', SemanticType.Numeric),
            Column('savings_status', SemanticType.Text),
            Column('employment', SemanticType.Text),
            Column('installment_commitment', SemanticType.Numeric),
            Column('personal_status', SemanticType.Text),
            Column('other_parties', SemanticType.Text),
            Column('residence_since', SemanticType.Numeric),
            Column('property_magnitude', SemanticType.Text),
            Column('age', SemanticType.Numeric),
            Column('other_payment_plans', SemanticType.Text),
            Column('housing', SemanticType.Text),
            Column('existing_credits', SemanticType.Numeric),
            Column('job', SemanticType.Text),
            Column('num_dependents', SemanticType.Numeric),
            Column('own_telephone', SemanticType.Text),
            Column('foreign_worker', SemanticType.Text),
            Column('class', SemanticType.Text),
        ],
    ),
)
# Print the ID of the newly created dataset.
print(dataset.id)
package com.aiaengine.examples.dataset;

import com.aiaengine.Dataset;
import com.aiaengine.Engine;
import com.aiaengine.Org;
import com.aiaengine.Project;
import com.aiaengine.datasource.DataSource;
import com.aiaengine.datasource.Schema;
import com.aiaengine.datasource.file.CSVFileSettings;
import com.aiaengine.datasource.file.FileSourceRequest;
import com.aiaengine.datasource.file.FileType;
import com.aiaengine.org.request.CreateProjectRequest;
import com.aiaengine.project.request.CreateDatasetRequest;

import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;

/**
 * Example application: creates a demo project, imports the German Credit CSV
 * file from the local file system as a dataset, and prints the dataset ID.
 */
public class ImportCsvApp {

    /** Returns a schema column named {@code name} with the TEXT semantic type. */
    private static Schema.Column text(String name) {
        return new Schema.Column(name, Schema.SemanticType.TEXT);
    }

    /** Returns a schema column named {@code name} with the NUMERIC semantic type. */
    private static Schema.Column numeric(String name) {
        return new Schema.Column(name, Schema.SemanticType.NUMERIC);
    }

    public static void main(String[] args) throws FileNotFoundException {
        Engine engine = new Engine();

        // Create a new demo project in the org.
        Org org = engine.getOrg("cae24b10-e6b0-4d61-8cef-a9f4b8f6133d"); // replace with your org ID
        Project project = org.createProject(
                CreateProjectRequest.builder()
                        .name("Demo project using Java SDK")
                        .description("Your demo project")
                        .build());
        // Alternatively, get an existing project that you want to work on:
        // Project project = engine.getProject("ID_of_your_project"); // replace with your own project ID

        String dataFilePath = "examples/datasets/german-credit.csv";

        // Columns are added in the same order as they are declared for the CSV file.
        List<Schema.Column> columns = new ArrayList<>();
        columns.add(text("checking_status"));
        columns.add(numeric("duration"));
        columns.add(text("credit_history"));
        columns.add(text("purpose"));
        columns.add(numeric("credit_amount"));
        columns.add(text("savings_status"));
        columns.add(text("employment"));
        columns.add(numeric("installment_commitment"));
        columns.add(text("personal_status"));
        columns.add(text("other_parties"));
        columns.add(numeric("residence_since"));
        columns.add(text("property_magnitude"));
        columns.add(numeric("age"));
        columns.add(text("other_payment_plans"));
        columns.add(text("housing"));
        columns.add(numeric("existing_credits"));
        columns.add(text("job"));
        columns.add(numeric("num_dependents"));
        columns.add(text("own_telephone"));
        columns.add(text("foreign_worker"));
        columns.add(text("class"));

        // Describe the local CSV file as a data source with the explicit schema.
        Schema schema = new Schema(columns);
        DataSource localDataSource = engine.buildFileSource(
                FileSourceRequest.builder()
                        .fileType(FileType.CSV)
                        .url(dataFilePath)
                        .fileSettings(new CSVFileSettings())
                        .schema(schema)
                        .build());

        // Create the dataset in the project from that data source.
        Dataset dataset = project.createDataset(
                CreateDatasetRequest.builder()
                        .name("German Credit Data")
                        .dataSource(localDataSource)
                        .timeout(900)
                        .build());

        System.out.println(dataset.getId());
    }
}

Refer to Data Import for more examples.

Retrieving information of a dataset

Once a dataset has been imported into the platform, you can retrieve information about it by providing the dataset ID.

import os
from aiaengine import Dataset

# Read the dataset ID from the DATASET_ID environment variable
# (an empty string is used when the variable is not set).
dataset_id = os.getenv('DATASET_ID', '')

# Look up the dataset by ID and print its name.
dataset = Dataset(id=dataset_id)
print(dataset.name)
package com.aiaengine.examples.dataset;

import com.aiaengine.Dataset;
import com.aiaengine.Engine;

import java.io.FileNotFoundException;

/** Example application: fetches a dataset by its ID and prints the dataset name. */
public class GetDatasetApp {
    public static void main(String[] args) throws FileNotFoundException {
        Engine engine = new Engine();

        // Fetch the dataset by ID, then print its name.
        Dataset dataset = engine.getDataset("f55717be-14dd-40ea-8cad-f71370d5b961");
        String datasetName = dataset.getName();
        System.out.println(datasetName);
    }
}

Updating a dataset

You can also modify the name and description of an existing dataset in the platform.

import os
from aiaengine import Dataset

# Load the dataset whose ID is given by the DATASET_ID environment variable
# (an empty string is used when the variable is not set).
dataset_id = os.environ.get('DATASET_ID', '')
dataset = Dataset(id=dataset_id)
# NOTE(review): this snippet only prints the current name; it does not change the
# dataset's name or description. An update call appears to be missing here --
# confirm the Python SDK's update method and add it.
print(dataset.name)
package com.aiaengine.examples.dataset;

import com.aiaengine.Dataset;
import com.aiaengine.Engine;
import com.aiaengine.org.request.UpdateOrgRequest;

import java.io.FileNotFoundException;

/** Example application: updates the name and description of an existing dataset. */
public class UpdateDatasetApp {
    public static void main(String[] args) throws FileNotFoundException {
        Engine engine = new Engine();

        // Fetch the dataset to be modified by its ID.
        Dataset dataset = engine.getDataset("93955467-8139-4423-9a02-d6cb4c422927");

        // NOTE(review): the request is built with UpdateOrgRequest although the
        // target is a Dataset -- confirm this is the intended request class.
        dataset.update(
                UpdateOrgRequest.builder()
                        .name("Updated Dataset")
                        .description("This dataset has been updated")
                        .build());
    }
}

Listing datasets in a project

For a particular project, you can list all of its datasets by providing the project ID.

import os
from aiaengine import Project

# Read the project ID from the PROJECT_ID environment variable
# (an empty string is used when the variable is not set).
project_id = os.getenv('PROJECT_ID', '')

# Look up the project, then list its datasets and print the result.
project = Project(id=project_id)
datasets = project.list_datasets()
print(datasets)
package com.aiaengine.examples.dataset;

import com.aiaengine.Dataset;
import com.aiaengine.Engine;
import com.aiaengine.Project;

import java.io.FileNotFoundException;
import java.util.List;

/** Example application: prints the name of every dataset in a project. */
public class ListDatasetApp {
    public static void main(String[] args) throws FileNotFoundException {
        Engine engine = new Engine();

        // Get the existing project whose datasets should be listed.
        Project project = engine.getProject("403a448d-9d86-497f-a9f6-414afa72a415");

        // Print each dataset name, in the order the list returns them.
        List<Dataset> projectDatasets = project.listDatasets();
        for (Dataset dataset : projectDatasets) {
            System.out.println(dataset.getName());
        }
    }
}

Deleting a dataset

If a particular dataset is no longer in use, you can remove it by specifying the dataset ID.

import os
from aiaengine import Dataset

# Read the dataset ID from the DATASET_ID environment variable
# (an empty string is used when the variable is not set).
dataset_id = os.getenv('DATASET_ID', '')

# Look up the dataset by ID and delete it.
dataset = Dataset(id=dataset_id)
dataset.delete()
package com.aiaengine.examples.dataset;

import com.aiaengine.Dataset;
import com.aiaengine.Engine;

import java.io.FileNotFoundException;

/** Example application: deletes a dataset identified by its ID. */
public class DeleteDatasetApp {
    public static void main(String[] args) throws FileNotFoundException {
        Engine engine = new Engine();

        // Fetch the dataset that is no longer needed, then delete it.
        Dataset obsoleteDataset = engine.getDataset("f55717be-14dd-40ea-8cad-f71370d5b961");
        obsoleteDataset.delete();
    }
}