# config.yaml

# Pipeline-level switches.
# NOTE(review): presumably makes the transform run as its own stage (a
# `transform_stage` worker pool is sized under `capacity`) — confirm.
separate_transform_stage: true
# The four partition key fields, in this order.
partition_keys:
  - tenant_id
  - entity_type
  - entity_domain
  - event_date

# A single input named `events`, keyed by `entity_id`.
inputs:
  - name: events
    locations:
      # NOTE(review): `folder` has no value here (parses as null); the path
      # looks elided — supply it before use.
      - folder: 
    key:
      - entity_id

# Transform step backed by a Python file.
transform:
  - python:
      # Single-quoted so YAML reads the `{{ ... }}` template expression as a
      # plain string rather than a flow mapping. Presumably the template
      # expands the USER_DIR environment variable — confirm.
      file: '{{env "USER_DIR"}}/transform.py'
      
outputs:
  # Output 1: event date is within the lookback window.
  # No `format:` is set; the original comment labels this output as avro
  # (presumably the default format — confirm).
  # NOTE(review): `destination` is empty (parses as null); the target looks
  # elided — supply it before use.
  - destination:
    project:
      '.':
        # Presumably skips records flagged `beyond_lookback` — confirm where
        # that flag is set.
        omit_if: beyond_lookback
        field_names:
          - entity_id
          - event_window_start
          - event_window_duration
          - has_impressions
          - has_clicks
          - has_engagements
          - has_conversions
          - has_views
          - has_interactions
          - has_geo_events
          - has_poi_visits
          - has_other_events
          - attributes
          - event_groups
          - conversions
          - views
          - geo_events
          - poi_visits
          - other_event
          - impressions
      
  # Output 2: event date is within the lookback window, written as parquet.
  # The field list is identical to the first (avro) output.
  # NOTE(review): `destination` is empty (parses as null); the target looks
  # elided — supply it before use.
  - destination:
    format: parquet
    project:
      '.':
        omit_if: beyond_lookback
        field_names:
          - entity_id
          - event_window_start
          - event_window_duration
          - has_impressions
          - has_clicks
          - has_engagements
          - has_conversions
          - has_views
          - has_interactions
          - has_geo_events
          - has_poi_visits
          - has_other_events
          - attributes
          - event_groups
          - conversions
          - views
          - geo_events
          - poi_visits
          - other_event
          - impressions
    
  # Output 3: event date is beyond the lookback window, written as parquet.
  # NOTE(review): `destination` is empty (parses as null); the target looks
  # elided — supply it before use.
  - destination:
    format: parquet
    project:
      '.':
        # Inverse of the in-window outputs, which use `omit_if` with the
        # same `beyond_lookback` name.
        omit_unless: beyond_lookback
        # Same fields as the in-window outputs plus a leading `batch_id`,
        # which is the split key below.
        field_names:
          - batch_id
          - entity_id
          - event_window_start
          - event_window_duration
          - has_impressions
          - has_clicks
          - has_engagements
          - has_conversions
          - has_views
          - has_interactions
          - has_geo_events
          - has_poi_visits
          - has_other_events
          - attributes
          - event_groups
          - conversions
          - views
          - geo_events
          - poi_visits
          - other_event
          - impressions
    split_by:
      key:
        - batch_id
      # Presumably drops the split key from the written records — confirm.
      strip_key: true

# Split settings: bucket size, in bytes.
split:
  bucket_size_bytes: 2147483648 # 2 GiB (2 * 1024^3 bytes)
# Sort settings: sort on `entity_id` and project only that field.
sort:
  # NOTE(review): 0 even though four `partition_keys` are declared at the
  # top of the file — presumably intentional; confirm.
  num_partition_keys: 0
  key_names:
    - entity_id
  project:
    '.':
      field_names: [ entity_id ]
# Worker count per pipeline stage; every stage listed here is configured
# through `in_proc_executor.num_workers`.
capacity:
  split_stage:
    in_proc_executor:
      num_workers: 12
  collate_stage:
    in_proc_executor:
      num_workers: 32
  sort_stage:
    in_proc_executor:
      num_workers: 4
  merge_plan_stage:
    in_proc_executor:
      num_workers: 30
  # Largest pool in this file.
  merge_do_stage:
    in_proc_executor:
      num_workers: 100
  # NOTE(review): presumably only used because `separate_transform_stage`
  # is true at the top of the file — confirm.
  transform_stage:
    in_proc_executor:
      num_workers: 40
# • Table of contents

# Was this article helpful?