# config.yaml

# Toggle for the transform step.
# NOTE(review): presumably makes the transform run as its own stage (see
# capacity.transform_stage near the end of this file) — confirm.
separate_transform_stage: true
# Input keys should be one of: Attributes, EventGroups, Conversions, Views,
# GeoEvents, POIVisit, OtherEvents, Impressions, Clicks, Engagements.
# Partitioning columns: tenant, entity type and domain, then event date and
# event hour.
# NOTE(review): presumably listed in outer-to-inner nesting order — confirm.
partition_keys:
  - tenant_id
  - entity_type
  - entity_domain
  - event_date
  - event_hour

# Input datasets. A single input, geo_events, is declared.
inputs:
  - name: geo_events
    locations:
      # TODO: placeholder — folder has no value (parses as null); set the
      # input location here.
      - folder: 
    # Key field(s) for this input; only entity_id is active, advertiser_id is
    # commented out.
    key:
      - entity_id
      #- advertiser_id

# Transform step: a single Python transform.
transform:
  - python:
      # Timeout of 40; the unit is seconds per the key name.
      timeout_in_sec: 40
      # Script path is built by the env "USER_DIR" template call (presumably
      # the USER_DIR environment variable — confirm) plus /transform.py.
      file: '{{env "USER_DIR"}}/transform.py'

# Output settings. A single output entry is declared.
outputs:
  # TODO: placeholder — destination has no value (parses as null); set the
  # output location here.
  - destination:
    # NOTE(review): presumably the partition columns removed from the output.
    # `shard` is not listed in partition_keys; it may be introduced when
    # is_output_sharded is true — confirm.
    drop_partitions:
      - shard
    
# Split sizing. Only bucket_size_bytes is active; task_size_bytes and
# num_buckets are commented out.
split:
  #task_size_bytes: 268435456
  # 268435456 bytes = 256 MiB.
  bucket_size_bytes: 268435456
  # In prod, set this to 64 (presumably refers to num_buckets below — confirm).
  #num_buckets:
  #- 8
# Sort settings: the sort key columns, the projected fields, and the templates
# for the key values.
sort:
  # NOTE(review): presumably the first num_partition_keys entries of key_names
  # (here: batch_id) form the partition key — confirm.
  num_partition_keys: 1
  key_names:
    - batch_id
    - entity_id
    # - advertiser_id

  # Fields projected from '.'; only entity_id is active.
  project:
    '.':
      field_names:
        - entity_id
        # - advertiser_id
  # NOTE(review): presumably one template per key_names entry, in the same
  # order (batch_id, then entity_id) — confirm.
  key:
    # Replace with the env BATCH_ID.
    - template: 'test_1'
    # The index expression is wrapped in a quoted string inside the outer
    # template delimiters; presumably so that the first templating pass emits
    # it literally for later evaluation — confirm.
    - template: '{{"{{index . 0}}"}}'
    # - template: '{{"{{index . 1}}"}}'

# NOTE(review): presumably marks the output as sharded; outputs.drop_partitions
# lists a `shard` column, which looks related — confirm.
is_output_sharded: true
# Per-stage worker counts; every stage is configured with an in_proc_executor.
# sort_stage has the fewest workers (4) and transform_stage the most (40);
# the other stages use 20 each.
capacity:
  split_stage:
    in_proc_executor:
      num_workers: 20
  collate_stage:
    in_proc_executor:
      num_workers: 20
  sort_stage:
    in_proc_executor:
      num_workers: 4
  merge_plan_stage:
    in_proc_executor:
      num_workers: 20
  merge_do_stage:
    in_proc_executor:
      num_workers: 20
  transform_stage:
    in_proc_executor:
      num_workers: 40
# Page-navigation residue from the source article (not part of the config):
#   • Table of contents
#   Was this article helpful?