config.yaml

separate_transform_stage: true
# input keys should be among: Attributes, EventGroups, Conversions, Views, GeoEvents, POIVisit, OtherEvents, Impressions, Clicks, Engagements
partition_keys:
  - tenant_id
  - entity_type
  - entity_domain
  - event_date
  - event_hour
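# Illustration only (the on-disk layout below is an assumption, not defined by this config):
# with the keys above, one partition of click events might live under a path such as
#   tenant_id=<tenant>/entity_type=Clicks/entity_domain=<domain>/event_date=2024-01-01/event_hour=03/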
 
inputs:
  - name: clicks
    locations:
      - folder: 
    key:
      - entity_id
      - advertiser_id

  - name: impressions
    locations:
      - folder: 
    key:
      - entity_id
      - advertiser_id
  
  - name: views
    locations:
      - folder: 
    key:
      - entity_id
      - advertiser_id
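
  # Note: all three inputs share the same composite key (entity_id, advertiser_id);
  # these are also the fields projected for the sort-stage key further below.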


transform:
  - python:
      timeout_in_sec: 30
      file: '{{env "USER_DIR"}}/transform.py'
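      # assumption: '{{env "USER_DIR"}}' is expanded from the USER_DIR environment variable
      # when the config template is rendered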
outputs:
  - destination: 
    drop_partitions:
    - shard

split:
  task_size_bytes: 268435456
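  # note: 268435456 bytes = 256 * 1024 * 1024 = 256 MiB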
  #bucket_size_bytes: 268435456
  # this ensures the split stage ends up with a fixed number of buckets (set below) irrespective of data size
  num_buckets:
  - 8

sort:
  num_partition_keys: 1
  key_names:
    - batch_id
    - entity_id
    - advertiser_id
    
  project:
    '.':
      field_names: [ entity_id, advertiser_id ]
  key:
    - template: 'test_1'   # placeholder; replace with the BATCH_ID env value, e.g. '{{env "BATCH_ID"}}'
    - template: '{{"{{index . 0}}"}}'
    - template: '{{"{{index . 1}}"}}'
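    # assumption: the '{{index . N}}' templates select from the projected fields above,
    # so index 0 -> entity_id and index 1 -> advertiser_id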
    
# this will completely skip the merge-plan and merge-do stages
is_output_sharded: true
capacity:
  split_stage:
    in_proc_executor:
      num_workers: 60
  collate_stage:
    in_proc_executor:
      num_workers: 32
  sort_stage:
    in_proc_executor:
      num_workers: 10
  merge_plan_stage:
    in_proc_executor:
      num_workers: 60
  merge_do_stage:
    in_proc_executor:
      num_workers: 60
  transform_stage:
    in_proc_executor:
      num_workers: 70
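
# Recap of the flags above: separate_transform_stage: true runs the transform as its own stage,
# and is_output_sharded: true skips merge-plan/merge-do, so the merge_*_stage worker counts
# are presumably unused for this run.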