# config.yaml

# Boolean flag. By its name it presumably makes the transform run as its own
# stage (a `transform_stage` entry exists under `capacity`) — TODO confirm
# against the consuming tool's documentation.
separate_transform_stage: true
# List of the four field names used as partition keys.
partition_keys:
  - tenant_id
  - entity_type
  - entity_domain
  - event_date
  
# Input datasets. A single input named `events` is declared.
inputs:
  - name: events
    locations:
      # NOTE(review): `folder` had no value, which parses as null. It looks
      # like a placeholder for the input path — set it before running.
      # TODO confirm the expected value with the pipeline owner.
      - folder: null
    # Key field name(s) declared for this input.
    key:
      - entity_id

# Transform steps. A single Python step is configured.
transform:
  - python:
      # Timeout for the step; the key name indicates the unit is seconds.
      timeout_in_sec: 30
      # Path to the transform script. The `{{env "USER_DIR"}}` expression is
      # presumably expanded from the USER_DIR environment variable by the
      # consuming tool — TODO confirm. The value is single-quoted so the
      # inner double quotes need no escaping; keep the text exactly as is.
      file: '{{env "USER_DIR"}}/transform.py'
      
# Output definitions. A single output is declared, with `format: parquet`.
outputs:
  # NOTE(review): `destination` had no value, which parses as null. It looks
  # like a placeholder for the output location — set it before running.
  # TODO confirm the expected value with the pipeline owner.
  - destination: null
    format: parquet
    # Partition name(s) listed for dropping on this output.
    drop_partitions:
      - shard
    project:
      # Projection for the '.' path (presumably the record root — TODO
      # confirm). One field per line so changes diff cleanly.
      '.':
        field_names:
          - batch_id
          - entity_id
          - event_window_start
          - event_window_duration
          - has_impressions
          - has_clicks
          - has_engagements
          - has_conversions
          - has_views
          - has_interactions
          - has_geo_events
          - has_poi_visits
          - has_other_events
          - has_id_syncs
          - has_text_clicks
          - attributes
          - event_groups
          - conversions
          - views
          - geo_events
          - poi_visits
          # NOTE(review): singular, unlike `has_other_events` above —
          # confirm the spelling against the record schema.
          - other_event
          - impressions
          - id_syncs
          - text_clicks
    # Output is split by `batch_id`; `strip_key: true` presumably removes
    # that key from the written records — TODO confirm.
    split_by:
      key:
        - batch_id
      strip_key: true

# Split settings.
split:
  # 268435456 bytes = 256 MiB (2^28).
  task_size_bytes: 268435456
  # Production setting: make this 16, 16.
  num_buckets:
    - 8
  
# Sort settings. Both the sort key list and the projected field list contain
# only `entity_id`.
sort:
  num_partition_keys: 0
  key_names:
    - entity_id
  project:
    '.':
      field_names:
        - entity_id

# Boolean flag. By its name it marks the output as sharded (note that
# `outputs` lists a `shard` partition to drop) — exact semantics TODO confirm.
is_output_sharded: true
# Per-stage capacity. Each of the six stages below configures an
# `in_proc_executor` with its own `num_workers` count.
capacity:
  split_stage:
    in_proc_executor:
      num_workers: 80
  collate_stage:
    in_proc_executor:
      num_workers: 32
  sort_stage:
    in_proc_executor:
      num_workers: 10
  merge_plan_stage:
    in_proc_executor:
      num_workers: 30
  merge_do_stage:
    in_proc_executor:
      num_workers: 100
  transform_stage:
    in_proc_executor:
      num_workers: 60
  • Table of contents

Was this article helpful?