Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Debian packages RPM packages NuGet packages

Repository URL to install this package:

Details    
sarus_data_spec / sarus_data_spec / protobuf / transform.proto
Size: Mime:
syntax = "proto3";
import "sarus_data_spec/protobuf/type.proto";
import "sarus_data_spec/protobuf/path.proto";
import "sarus_data_spec/protobuf/scalar.proto";

// A dataset transform: identifying metadata plus a `Spec` that selects the
// concrete kind of transform. Every supporting message and enum is nested
// inside this message.
//
// Compatibility: field numbers, field names, message names and enum value
// names are part of the wire / JSON / generated-code contract. Do not
// renumber or rename them; add new fields with fresh numbers instead.
message Transform {
  // e.g. RFC 4122 id used to refer to the transform
  string uuid = 1;
  // Name of the transform.
  string name = 2;
  // NOTE(review): presumably free-form documentation text — confirm.
  string doc = 3;
  // Which kind of transform this is, together with its parameters.
  Spec spec = 4;
  // Other properties
  map<string, string> properties = 5;
  // NOTE(review): presumably marks a transform that can be inverted — confirm.
  bool inversible = 6;
  // NOTE(review): presumably marks a transform whose output keeps the input
  // schema — confirm.
  bool schema_preserving = 7;

  // Definitions

  // Selector for the concrete transform kind. The members live in a oneof, so
  // at most one of them is set at a time.
  message Spec {
    // The numbering jumps from 31 to 33. Reserving 32 prevents accidental
    // reuse. NOTE(review): presumably a removed member — confirm.
    reserved 32;

    oneof spec {
      Identity identity = 1;
      Variable variable = 2;
      Composed composed = 3;
      Project project = 4;
      Filter filter = 5;
      Shuffle shuffle = 6;
      Join join = 7;
      Cast cast = 8;
      Sample sample = 9;
      UserSettings user_settings = 10;
      PrivacyUnitTracking privacy_unit_tracking = 11;
      // np transforms, pd transforms,...
      External external = 12;
      Synthetic synthetic = 13;
      Transcode transcode = 14;
      InverseTranscode inverse_transcode = 15;
      GetItem get_item = 16;
      PrivacyUnitTrackingPaths privacy_unit_tracking_paths = 17;
      AutomaticUserSettings automatic_user_settings = 18;
      PublicPaths public_paths = 19;
      AssignBudget assign_budget = 20;
      AutomaticBudget automatic_budget = 21;
      AttributesBudget attribute_budget = 22;
      SDBudget sd_budget = 23;
      DeriveSeed derive_seed = 24;
      GroupByPE group_by_pe = 25;
      // Not used anymore; kept so the field number is never reused.
      SamplingRatios sampling_ratios = 26 [deprecated = true];
      SelectSql select_sql = 27;
      Extract extract = 28;
      RelationshipSpec relationship_spec = 29;
      DifferentiatedSample differentiated_sample = 30;
      ValidatedUserType validated_user_type = 31;
      ErrorEstimation error_estimation = 33;
      FitModel fit_model = 34;
      FitModelDP fit_model_dp = 35;
      GenerateFromModel generate_from_model = 36;
      ToSmallData to_small_data = 37;
      PushSQL push_sql = 38;
      SelectTable select_table = 39;
    }
  }

  // A transform implemented by an external library operation.
  message External {
    // Opaque byte payloads; their serialization format is not defined in this
    // file.
    bytes arguments = 1;
    bytes named_arguments = 2;
    // Identifies the library and the operation within it.
    OpIdentifier op_identifier = 3;

    // Exactly one library-specific identifier is set.
    message OpIdentifier {
      oneof op {
        Std std = 1;
        Pandas pandas = 2;
        Numpy numpy = 3;
        Tensorflow tensorflow = 4;
        Sklearn sklearn = 5;
        PandasProfiling pandas_profiling = 6;
        XGBoost xgboost = 7;
        Skopt skopt = 8;
        Imblearn imblearn = 9;
        Shap shap = 10;
        Scipy scipy = 11;
        OptBinning optbinning = 12;
      }
    }

    // Each message below carries a single `name` string.
    // NOTE(review): presumably the operation's name within that library —
    // confirm.

    message Std {
      string name = 1;
    }

    message Pandas {
      string name = 1;
    }

    message Numpy {
      string name = 1;
    }

    message Tensorflow {
      string name = 1;
    }

    message Sklearn {
      string name = 1;
    }

    message PandasProfiling {
      string name = 1;
    }

    message XGBoost {
      string name = 1;
    }

    message Skopt {
      string name = 1;
    }

    message Imblearn {
      string name = 1;
    }

    message Shap {
      string name = 1;
    }

    message Scipy {
      string name = 1;
    }

    message OptBinning {
      string name = 1;
    }
  }

  // Does nothing
  message Identity {}

  // Numbered or named identity to use as input of the composed transform
  message Variable {
    int32 position = 1;
    string name = 2;
  }

  // A transform applied to arguments that are themselves transforms.
  message Composed {
    // Transform
    string transform = 1;
    // Arguments of the current transform are transforms
    repeated string arguments = 2;
    // Arguments passed by name.
    map<string, string> named_arguments = 3;
  }

  message Project {
    // This should be a 'supertype': the type the data can be projected into.
    // For product types (struct or tuple), this is a type with a subset of the
    // fields.
    // For map types, this is a map type with subset (subtype) of key and
    // superset (supertype) of value.
    // For union types, this is a type with more terms in the union.
    // To start with, let's simply use this with structs.
    sarus_data_spec.Type projection = 1;
  }

  message Filter {
    // This should be a 'subtype': the type the data can be restricted to.
    // For union types, this is a type with fewer terms in the union.
    // Optional types can for instance be filtered to non-optional.
    // Value type and Predicate types can be used to restrict values.
    sarus_data_spec.Type filter = 1;
  }

  // Parameterless; selected through `Spec.shuffle`.
  message Shuffle {}

  message Join {
    // This should be a common 'supertype' between tables.
    sarus_data_spec.Type on = 1;
  }

  message Cast {
    // Type to cast into.
    sarus_data_spec.Type type = 1;
  }

  // Sample a dataset
  message Sample {
    // How much to sample: either a fraction or an absolute size.
    oneof proportion {
      double fraction = 1;
      int64 size = 2;
    }
    sarus_data_spec.Scalar seed = 3;
  }

  // Not referenced by `Spec` in this file.
  message SchemaInference {
    CastPolicy cast_policy = 1;

    enum CastPolicy {
      // Zero value, hence the proto3 default.
      NONE = 0;
      MOST_LIKELY = 1;
    }
  }

  // Not referenced by `Spec` in this file.
  message GroupBy {
    string key = 1;
  }

  message ToSmallData {
    int32 size = 1;
    bool random_sampling = 2;
    sarus_data_spec.Scalar seed = 3;
  }

  message PushSQL {
    string dataconnection_name = 1;
    string schema_name = 2;
    string table_name = 3;
    string uri = 4;
  }

  // Parameterless; selected through `Spec.synthetic`.
  message Synthetic {}

  // Parameterless; selected through `Spec.user_settings`.
  message UserSettings {}

  message AutomaticUserSettings {
    int64 max_categories = 1;
    int64 batch_size_sample = 2;
    int64 max_table_size_sample = 3;
    double sampling_ratio = 4;
  }

  // Parameterless; selected through `Spec.privacy_unit_tracking`.
  message PrivacyUnitTracking {}

  // Parameterless; selected through `Spec.transcode`.
  message Transcode {}

  // Parameterless; selected through `Spec.inverse_transcode`.
  message InverseTranscode {}

  // Same field layout as `Sample`: a fraction or an absolute size, plus a
  // seed.
  message DifferentiatedSample {
    oneof proportion {
      double fraction = 1;
      int64 size = 2;
    }
    sarus_data_spec.Scalar seed = 3;
  }

  // Parameterless; selected through `Spec.privacy_unit_tracking_paths`.
  message PrivacyUnitTrackingPaths {}

  // Parameterless; selected through `Spec.public_paths`.
  message PublicPaths {}

  message GetItem {
    sarus_data_spec.Path path = 1;
  }

  // Parameterless; selected through `Spec.assign_budget`.
  message AssignBudget {}

  // Parameterless; selected through `Spec.automatic_budget`.
  message AutomaticBudget {}

  // Parameterless; selected through `Spec.attribute_budget`.
  message AttributesBudget {}

  // Parameterless; selected through `Spec.sd_budget`.
  message SDBudget {}

  message DeriveSeed {
    int64 random_integer = 1;
  }

  // Parameterless; selected through `Spec.group_by_pe`.
  message GroupByPE {}

  // Parameterless. The `Spec.sampling_ratios` member that selects it is no
  // longer used.
  message SamplingRatios {}

  // Parameterless; selected through `Spec.relationship_spec`.
  message RelationshipSpec {}

  message SelectSql {
    // Either a single query string or a set of path-aliased queries.
    oneof select {
      string query = 1;
      AliasedQueries aliased_queries = 2;
    }
    SQLDialect sql_dialect = 3;
    OpIdentifier op_identifier = 4;

    // Distinct from `External.OpIdentifier`: this one only wraps a `SqlOp`.
    message OpIdentifier {
      SqlOp sql_op = 1;

      enum SqlOp {
        // Zero value, hence the proto3 default.
        NONE = 0;
        PUP = 1;
        DP = 2;
      }
    }
  }

  enum SQLDialect {
    // Zero value, hence the proto3 default.
    NONE = 0;
    POSTGRES = 1;
    SQL_SERVER = 2;
    MY_SQL = 3;
    // Spelling (sic) is kept: renaming an enum value breaks generated code and
    // the JSON / text formats.
    SQLLITE = 4;
    ORACLE = 5;
    BIG_QUERY = 6;
    REDSHIFT = 7;
    HIVE = 8;
  }

  message AliasedQueries {
    repeated AliasedQuery aliased_query = 1;
  }

  // A query string associated with a path.
  message AliasedQuery {
    sarus_data_spec.Path path = 1;
    string query = 2;
  }

  message Extract {
    int32 size = 1;
  }

  // Parameterless; selected through `Spec.validated_user_type`.
  message ValidatedUserType {}

  // Parameterless; selected through `Spec.error_estimation`.
  message ErrorEstimation {}

  message FitModel {
    int32 batch_size = 1;
    int32 epochs = 2;
    TextKind text_kind = 3;
    bool quantize = 4;
    bool use_lora = 5;
    double learning_rate = 6;
    repeated string lora_attn_modules = 7;
    bool apply_lora_to_mlp = 8;
    bool apply_lora_to_output = 9;
    int32 lora_rank = 10;
    int32 lora_alpha = 11;
  }

  // Either a single text field or a question/answer pair of fields.
  message TextKind {
    oneof text_kind {
      string text_field = 1;
      Chat chat = 2;
    }
  }

  message Chat {
    string question_field = 1;
    string answer_field = 2;
  }

  // Same options as `FitModel` plus `l2_norm_clip`, with different field
  // numbers.
  message FitModelDP {
    // The numbering jumps from 1 to 3. Reserving 2 prevents accidental reuse.
    // NOTE(review): presumably a removed field — confirm.
    reserved 2;

    int32 batch_size = 1;
    int32 epochs = 3;
    double l2_norm_clip = 4;
    TextKind text_kind = 5;
    bool quantize = 6;
    bool use_lora = 7;
    double learning_rate = 8;
    repeated string lora_attn_modules = 9;
    bool apply_lora_to_mlp = 10;
    bool apply_lora_to_output = 11;
    int32 lora_rank = 12;
    int32 lora_alpha = 13;
  }

  message GenerateFromModel {
    int32 max_new_tokens = 1;
    double temperature = 2;
  }

  message SelectTable {
    sarus_data_spec.Path path = 1;
  }
}