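// NOTE: the lines below are header residue from a package-index page and are
// not part of the schema; they are kept as comments so the file stays valid.
// Repository URL to install this package: (not captured)
// Version: 4.5.4.dev1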
syntax = "proto3";
import "sarus_data_spec/protobuf/type.proto";
import "sarus_data_spec/protobuf/path.proto";
import "sarus_data_spec/protobuf/scalar.proto";
// A dataset transform.
//
// Holds identifying metadata, two boolean flags and a `spec` whose oneof
// selects at most one kind of transform. The parameter message of every kind
// is nested inside this message.
message Transform {
  // Identifier used to refer to the transform, e.g. an RFC 4122 id.
  string uuid = 1;
  string name = 2;
  string doc = 3;
  // Selects the kind of transform and carries its parameters.
  Spec spec = 4;
  // Other properties.
  map<string, string> properties = 5;
  // NOTE(review): presumably means "invertible"; the spelling is kept because
  // renaming the field would break generated code and JSON keys.
  bool inversible = 6;
  bool schema_preserving = 7;

  // Definitions

  // Wrapper around the oneof that lists every transform kind.
  message Spec {
    // Number 32 is absent from the sequence used by the oneof below. It is
    // reserved so that a new member cannot pick it up by accident.
    // NOTE(review): presumably a removed transform; confirm against history.
    reserved 32;

    oneof spec {
      Identity identity = 1;
      Variable variable = 2;
      Composed composed = 3;
      Project project = 4;
      Filter filter = 5;
      Shuffle shuffle = 6;
      Join join = 7;
      Cast cast = 8;
      Sample sample = 9;
      UserSettings user_settings = 10;
      PrivacyUnitTracking privacy_unit_tracking = 11;
      // np transforms, pd transforms, ...
      External external = 12;
      Synthetic synthetic = 13;
      Transcode transcode = 14;
      InverseTranscode inverse_transcode = 15;
      GetItem get_item = 16;
      PrivacyUnitTrackingPaths privacy_unit_tracking_paths = 17;
      AutomaticUserSettings automatic_user_settings = 18;
      PublicPaths public_paths = 19;
      AssignBudget assign_budget = 20;
      AutomaticBudget automatic_budget = 21;
      AttributesBudget attribute_budget = 22;
      SDBudget sd_budget = 23;
      DeriveSeed derive_seed = 24;
      GroupByPE group_by_pe = 25;
      // Not used anymore; kept so that the field number stays allocated.
      SamplingRatios sampling_ratios = 26 [deprecated = true];
      SelectSql select_sql = 27;
      Extract extract = 28;
      RelationshipSpec relationship_spec = 29;
      DifferentiatedSample differentiated_sample = 30;
      ValidatedUserType validated_user_type = 31;
      ErrorEstimation error_estimation = 33;
      FitModel fit_model = 34;
      FitModelDP fit_model_dp = 35;
      GenerateFromModel generate_from_model = 36;
      ToSmallData to_small_data = 37;
      PushSQL push_sql = 38;
      SelectTable select_table = 39;
    }
  }

  // A transform identified by an operation of an external library
  // (np transforms, pd transforms, ...).
  message External {
    // Opaque bytes; their encoding is not defined in this file.
    bytes arguments = 1;
    // Opaque bytes; their encoding is not defined in this file.
    bytes named_arguments = 2;
    // Library and operation name.
    OpIdentifier op_identifier = 3;

    // Selects the library the operation belongs to. Every alternative is a
    // message holding a single `name` string.
    message OpIdentifier {
      oneof op {
        Std std = 1;
        Pandas pandas = 2;
        Numpy numpy = 3;
        Tensorflow tensorflow = 4;
        Sklearn sklearn = 5;
        PandasProfiling pandas_profiling = 6;
        XGBoost xgboost = 7;
        Skopt skopt = 8;
        Imblearn imblearn = 9;
        Shap shap = 10;
        Scipy scipy = 11;
        OptBinning optbinning = 12;
      }
    }

    message Std {
      string name = 1;
    }

    message Pandas {
      string name = 1;
    }

    message Numpy {
      string name = 1;
    }

    message Tensorflow {
      string name = 1;
    }

    message Sklearn {
      string name = 1;
    }

    message PandasProfiling {
      string name = 1;
    }

    message XGBoost {
      string name = 1;
    }

    message Skopt {
      string name = 1;
    }

    message Imblearn {
      string name = 1;
    }

    message Shap {
      string name = 1;
    }

    message Scipy {
      string name = 1;
    }

    message OptBinning {
      string name = 1;
    }
  }

  // Does nothing.
  message Identity {}

  // Numbered or named identity to use as input of the composed transform.
  message Variable {
    int32 position = 1;
    string name = 2;
  }

  // A transform applied to other transforms.
  message Composed {
    // Transform.
    // NOTE(review): presumably a reference such as a transform uuid; confirm.
    string transform = 1;
    // Arguments of the current transform are transforms.
    repeated string arguments = 2;
    map<string, string> named_arguments = 3;
  }

  message Project {
    // This should be a 'supertype' of the data type: the type the data can be
    // projected into.
    // For product types (struct or tuple), this is a type with a subset of the
    // fields.
    // For map types, this is a map type with a subset (subtype) of the key and
    // a superset (supertype) of the value.
    // For union types, this is a type with more terms in the union.
    // To start with, let's simply use this with structs.
    sarus_data_spec.Type projection = 1;
  }

  message Filter {
    // This should be a 'subtype' of the data type: the type the data can be
    // restricted to.
    // For union types, this is a type with fewer terms in the union.
    // Optional types can for instance be filtered to non-optional.
    // Value types and Predicate types can be used to restrict values.
    sarus_data_spec.Type filter = 1;
  }

  // No parameters.
  message Shuffle {}

  message Join {
    // This should be a common 'supertype' between tables.
    sarus_data_spec.Type on = 1;
  }

  message Cast {
    // Type to cast into.
    sarus_data_spec.Type type = 1;
  }

  // Sample a dataset.
  message Sample {
    // Amount to sample, given either as a fraction or as a size.
    oneof proportion {
      double fraction = 1;
      int64 size = 2;
    }
    sarus_data_spec.Scalar seed = 3;
  }

  // Not referenced by the `Spec` oneof in this file.
  message SchemaInference {
    CastPolicy cast_policy = 1;

    enum CastPolicy {
      NONE = 0;
      MOST_LIKELY = 1;
    }
  }

  // Not referenced by the `Spec` oneof in this file.
  message GroupBy {
    string key = 1;
  }

  message ToSmallData {
    int32 size = 1;
    bool random_sampling = 2;
    sarus_data_spec.Scalar seed = 3;
  }

  message PushSQL {
    string dataconnection_name = 1;
    string schema_name = 2;
    string table_name = 3;
    string uri = 4;
  }

  // No parameters.
  message Synthetic {}

  // No parameters.
  message UserSettings {}

  message AutomaticUserSettings {
    int64 max_categories = 1;
    int64 batch_size_sample = 2;
    int64 max_table_size_sample = 3;
    double sampling_ratio = 4;
  }

  // No parameters.
  message PrivacyUnitTracking {}

  // No parameters.
  message Transcode {}

  // No parameters.
  message InverseTranscode {}

  // Has the same fields as `Sample`.
  message DifferentiatedSample {
    // Amount to sample, given either as a fraction or as a size.
    oneof proportion {
      double fraction = 1;
      int64 size = 2;
    }
    sarus_data_spec.Scalar seed = 3;
  }

  // No parameters.
  message PrivacyUnitTrackingPaths {}

  // No parameters.
  message PublicPaths {}

  message GetItem {
    sarus_data_spec.Path path = 1;
  }

  // No parameters.
  message AssignBudget {}

  // No parameters.
  message AutomaticBudget {}

  // No parameters.
  message AttributesBudget {}

  // No parameters.
  message SDBudget {}

  message DeriveSeed {
    int64 random_integer = 1;
  }

  // No parameters.
  message GroupByPE {}

  // No parameters. The `Spec` member using this message is not used anymore.
  message SamplingRatios {}

  // No parameters.
  message RelationshipSpec {}

  message SelectSql {
    // Either a single query or a list of queries, each attached to a path.
    oneof select {
      string query = 1;
      AliasedQueries aliased_queries = 2;
    }
    SQLDialect sql_dialect = 3;
    OpIdentifier op_identifier = 4;

    // Distinct from `External.OpIdentifier`; scoped to `SelectSql`.
    message OpIdentifier {
      SqlOp sql_op = 1;

      enum SqlOp {
        NONE = 0;
        PUP = 1;
        DP = 2;
      }
    }
  }

  // SQL dialects.
  // NOTE(review): `SQLLITE` looks like a misspelling of SQLite; the value name
  // is kept because renaming it would break generated code.
  enum SQLDialect {
    NONE = 0;
    POSTGRES = 1;
    SQL_SERVER = 2;
    MY_SQL = 3;
    SQLLITE = 4;
    ORACLE = 5;
    BIG_QUERY = 6;
    REDSHIFT = 7;
    HIVE = 8;
  }

  message AliasedQueries {
    repeated AliasedQuery aliased_query = 1;
  }

  // A query attached to a path.
  message AliasedQuery {
    sarus_data_spec.Path path = 1;
    string query = 2;
  }

  message Extract {
    int32 size = 1;
  }

  // No parameters.
  message ValidatedUserType {}

  // No parameters.
  message ErrorEstimation {}

  message FitModel {
    int32 batch_size = 1;
    int32 epochs = 2;
    TextKind text_kind = 3;
    bool quantize = 4;
    bool use_lora = 5;
    double learning_rate = 6;
    repeated string lora_attn_modules = 7;
    bool apply_lora_to_mlp = 8;
    bool apply_lora_to_output = 9;
    int32 lora_rank = 10;
    int32 lora_alpha = 11;
  }

  // Either the name of a single text field or a pair of chat fields.
  message TextKind {
    oneof text_kind {
      string text_field = 1;
      Chat chat = 2;
    }
  }

  message Chat {
    string question_field = 1;
    string answer_field = 2;
  }

  // Has the fields of `FitModel` plus `l2_norm_clip`, with different field
  // numbers.
  message FitModelDP {
    // Number 2 is absent from the sequence below. It is reserved so that a
    // new field cannot pick it up by accident.
    // NOTE(review): presumably a removed field; confirm against history.
    reserved 2;

    int32 batch_size = 1;
    int32 epochs = 3;
    double l2_norm_clip = 4;
    TextKind text_kind = 5;
    bool quantize = 6;
    bool use_lora = 7;
    double learning_rate = 8;
    repeated string lora_attn_modules = 9;
    bool apply_lora_to_mlp = 10;
    bool apply_lora_to_output = 11;
    int32 lora_rank = 12;
    int32 lora_alpha = 13;
  }

  message GenerateFromModel {
    int32 max_new_tokens = 1;
    double temperature = 2;
  }

  message SelectTable {
    sarus_data_spec.Path path = 1;
  }
}