To create a new dataset, define its name and settings. The workspace and client parameters are optional: pass them when you want to create the dataset in a specific workspace or on a specific Argilla instance.
class Dataset(Resource, DiskImportExportMixin):
    """Class for interacting with Argilla Datasets

    Attributes:
        name: Name of the dataset.
        records (DatasetRecords): The records object for the dataset. Used to interact with the records of the dataset by iterating, searching, etc.
        settings (Settings): The settings object of the dataset. Used to configure the dataset with fields, questions, guidelines, etc.
        fields (list): The fields of the dataset, for example the `rg.TextField` of the dataset. Defined in the settings.
        questions (list): The questions of the dataset defined in the settings. For example, the `rg.TextQuestion` that you want labelers to answer.
        guidelines (str): The guidelines of the dataset defined in the settings. Used to provide instructions to labelers.
        allow_extra_metadata (bool): True if extra metadata is allowed, False otherwise.
        schema (dict): The schema exposed by the settings of the dataset.
    """

    name: str
    id: Optional[UUID]

    _api: "DatasetsAPI"
    _model: "DatasetModel"

    def __init__(
        self,
        name: Optional[str] = None,
        workspace: Optional[Union["Workspace", str]] = None,
        settings: Optional[Settings] = None,
        client: Optional["Argilla"] = None,
        _model: Optional[DatasetModel] = None,
    ) -> None:
        """Initializes a new Argilla Dataset object with the given parameters.

        Parameters:
            name (str): Name of the dataset. Replaced by random UUID if not assigned.
                Ignored when `_model` is provided, since the model is then used as is.
            workspace (Union[Workspace, str]): Workspace of the dataset, given as a workspace object
                or as a workspace name. Default is the first workspace found in the server.
            settings (Settings): Settings class to be used to configure the dataset.
            client (Argilla): Instance of Argilla to connect with the server. Default is the default client.
            _model (DatasetModel): Model of the dataset. Used to create the dataset from an existing model.

        Raises:
            NotFoundError: If `workspace` is a name and that workspace does not exist.
        """
        client = client or Argilla._get_default()
        super().__init__(client=client, api=client.api.datasets)

        # A random name is only needed when a new model is going to be built below;
        # an existing `_model` is used unchanged, so generating (and logging) a name
        # for it would be misleading.
        if name is None and not _model:
            name = f"dataset_{uuid4()}"
            self._log_message(f"Settings dataset name to unique UUID: {name}")

        # Prefer the workspace recorded on the model; otherwise resolve it from the argument.
        self.workspace_id = (
            _model.workspace_id
            if _model and _model.workspace_id
            else self.__workspace_id_from_name(workspace=workspace)
        )
        self._model = _model or DatasetModel(
            name=name,
            workspace_id=UUIDUtilities.convert_optional_uuid(uuid=self.workspace_id),
        )
        self._settings = self.__configure_settings_for_dataset(settings=settings)
        self.__records = DatasetRecords(client=self._client, dataset=self)

    #####################
    #  Properties       #
    #####################

    @property
    def name(self) -> str:
        """Name of the dataset, read from the underlying model."""
        return self._model.name

    @name.setter
    def name(self, value: str) -> None:
        self._model.name = value

    @property
    def records(self) -> "DatasetRecords":
        """Records object bound to this dataset."""
        return self.__records

    @property
    def settings(self) -> Settings:
        """Settings of the dataset.

        When the dataset is published and the local settings are flagged as outdated,
        `Settings.get()` is called before they are returned.
        """
        if self.__is_published() and self._settings.is_outdated:
            self._settings.get()
        return self._settings

    @settings.setter
    def settings(self, value: Settings) -> None:
        self._settings = self.__configure_settings_for_dataset(settings=value)

    @property
    def fields(self) -> list:
        """Fields defined in the settings."""
        return self.settings.fields

    @property
    def questions(self) -> list:
        """Questions defined in the settings."""
        return self.settings.questions

    @property
    def guidelines(self) -> str:
        """Guidelines defined in the settings."""
        return self.settings.guidelines

    @guidelines.setter
    def guidelines(self, value: str) -> None:
        self.settings.guidelines = value

    @property
    def allow_extra_metadata(self) -> bool:
        """Whether the settings allow extra metadata."""
        return self.settings.allow_extra_metadata

    @allow_extra_metadata.setter
    def allow_extra_metadata(self, value: bool) -> None:
        self.settings.allow_extra_metadata = value

    @property
    def schema(self) -> dict:
        """Schema exposed by the settings."""
        return self.settings.schema

    #####################
    #  Core methods     #
    #####################

    def exists(self) -> bool:
        """Checks if the dataset exists on the server

        Returns:
            bool: True if the dataset exists, False otherwise
        """
        # `a and b` evaluates to one of its operands, so without the explicit
        # conversion a dataset without an id would yield `None` instead of `False`.
        return bool(self.id and self._api.exists(self.id))

    def create(self) -> "Dataset":
        """Creates the dataset on the server with the `Settings` configuration.

        Returns:
            Dataset: The created dataset object.

        Raises:
            SettingsError: If publishing fails. The error is logged and a rollback of the
                dataset creation is attempted before raising.
        """
        super().create()
        try:
            return self._publish()
        except Exception as e:
            self._log_message(message=f"Error creating dataset: {e}", level="error")
            self.__rollback_dataset_creation()
            raise SettingsError from e

    def update(self) -> "Dataset":
        """Updates the dataset on the server with the current settings.

        Returns:
            Dataset: The updated dataset object.
        """
        self.settings.update()
        return self

    @classmethod
    def from_model(cls, model: DatasetModel, client: "Argilla") -> "Dataset":
        """Builds a `Dataset` around an existing `DatasetModel`."""
        return cls(client=client, _model=model)

    #####################
    #  Utility methods  #
    #####################

    def _publish(self) -> "Dataset":
        """Creates the settings, publishes the dataset through the API and returns `self.get()`."""
        self._settings.create()
        self._api.publish(dataset_id=self._model.id)

        return self.get()  # type: ignore

    def __configure_settings_for_dataset(
        self,
        settings: Optional[Settings] = None,
    ) -> Settings:
        """Binds `settings` to this dataset, creating empty settings (with a warning) when none are given."""
        if settings is None:
            settings = Settings(_dataset=self)
            # Adjacent literals are used instead of a backslash continuation inside the
            # string, which would embed the source indentation in the message.
            warnings.warn(
                message="Settings not provided. Using empty settings for the dataset. "
                "Define the settings before creating the dataset.",
                stacklevel=2,
            )
        else:
            settings.dataset = self
        return settings

    def __workspace_id_from_name(self, workspace: Optional[Union["Workspace", str]]) -> UUID:
        """Resolves the workspace argument to a workspace id.

        `None` selects the first workspace of the client (with a warning), a string is
        looked up by name, and any other value is treated as a workspace object.

        Raises:
            NotFoundError: If a workspace name is given and that workspace does not exist.
        """
        if workspace is None:
            available_workspaces = self._client.workspaces
            ws = available_workspaces[0]  # type: ignore
            warnings.warn(f"Workspace not provided. Using default workspace: {ws.name} id: {ws.id}")
        elif isinstance(workspace, str):
            ws = self._client.workspaces(workspace)
            if not ws.exists():
                # The list of names is only needed for the error message, so it is built here.
                available_workspace_names = [w.name for w in self._client.workspaces]
                self._log_message(
                    message=f"Workspace with name {workspace} not found. "
                    f"Available workspaces: {available_workspace_names}",
                    level="error",
                )
                raise NotFoundError()
        else:
            ws = workspace
        return ws.id

    def __rollback_dataset_creation(self):
        """Deletes the dataset when it exists on the server but is not published."""
        if self.exists() and not self.__is_published():
            self.delete()

    def __is_published(self) -> bool:
        """Returns True when the dataset exists and its model status is "ready"."""
        return self.exists() and self._model.status == "ready"
def __init__(
    self,
    name: Optional[str] = None,
    workspace: Optional[Union["Workspace", str]] = None,
    settings: Optional[Settings] = None,
    client: Optional["Argilla"] = None,
    _model: Optional[DatasetModel] = None,
) -> None:
    """Initializes a new Argilla Dataset object with the given parameters.

    Parameters:
        name (str): Name of the dataset. Replaced by random UUID if not assigned.
            Ignored when `_model` is provided, since the model is then used as is.
        workspace (Union[Workspace, str]): Workspace of the dataset, given as a workspace object
            or as a workspace name. Default is the first workspace found in the server.
        settings (Settings): Settings class to be used to configure the dataset.
        client (Argilla): Instance of Argilla to connect with the server. Default is the default client.
        _model (DatasetModel): Model of the dataset. Used to create the dataset from an existing model.
    """
    client = client or Argilla._get_default()
    super().__init__(client=client, api=client.api.datasets)

    # A random name is only needed when a new model is going to be built below;
    # an existing `_model` is used unchanged, so generating (and logging) a name
    # for it would be misleading.
    if name is None and not _model:
        name = f"dataset_{uuid4()}"
        self._log_message(f"Settings dataset name to unique UUID: {name}")

    # Prefer the workspace recorded on the model; otherwise resolve it from the argument.
    self.workspace_id = (
        _model.workspace_id
        if _model and _model.workspace_id
        else self.__workspace_id_from_name(workspace=workspace)
    )
    self._model = _model or DatasetModel(
        name=name,
        workspace_id=UUIDUtilities.convert_optional_uuid(uuid=self.workspace_id),
    )
    self._settings = self.__configure_settings_for_dataset(settings=settings)
    self.__records = DatasetRecords(client=self._client, dataset=self)
def create(self) -> "Dataset":
    """Create the dataset on the server and publish it with its `Settings`.

    The parent `create()` runs first, then `_publish()` is called. If publishing
    raises, the error is logged, a rollback of the dataset creation is attempted,
    and a `SettingsError` chained to the original exception is raised.

    Returns:
        Dataset: The created dataset object.

    Raises:
        SettingsError: If publishing the dataset fails.
    """
    super().create()
    try:
        published = self._publish()
    except Exception as error:
        self._log_message(message=f"Error creating dataset: {error}", level="error")
        self.__rollback_dataset_creation()
        raise SettingsError from error
    return published
def exists(self) -> bool:
    """Checks if the dataset exists on the server

    A dataset without an id is reported as not existing without querying the API.

    Returns:
        bool: True if the dataset exists, False otherwise
    """
    # `a and b` evaluates to one of its operands, so without the explicit
    # conversion a dataset without an id would yield `None` instead of `False`.
    return bool(self.id and self._api.exists(self.id))
def update(self) -> "Dataset":
    """Push the current settings of the dataset to the server.

    Returns:
        Dataset: This dataset object, after its settings have been updated.
    """
    current_settings = self.settings
    current_settings.update()
    return self