# version the config so we can make breaking changes
version: v2

# agent internal settings
agent_settings:
  tag: prod
  log:
    level: debug
    secure_logging: true # If enabled, some sensitive information will be redacted (can affect agent performance)
  # soft_cpu_limit is only honored by the clustering processor at the moment. 0.5 means 50% of a core.
  # it can be enabled by setting cpu_friendly=true in the clustering rule.
  soft_cpu_limit: 0.5
  # When anomaly_tolerance is non-zero, anomaly scores handle edge cases better when stddev is too small. Default is 0.01. Can be set at rule level for some rule types.
  anomaly_tolerance: 0.1
  # Anomaly scores will not be calculated for the first 1m after a source is found. Default is 30m. Can be set at rule level for some rule types.
  anomaly_confidence_period: 1m
  # Skips empty intervals when rolling so that anomaly scores are calculated based on the history of non-zero intervals. Default is true. Can be set at rule level for some rule types.
  skip_empty_intervals: false
  # Only report non-zero stats. Default is true. Can be set at rule level for some rule types.
  only_report_nonzeros: false
  # Anomaly capture size defines the logging buffer's size in terms of number of logs. Default is 100.
  anomaly_capture_size: 1000
  # Anomaly capture byte size defines the logging buffer's byte size limit. Default is empty, meaning no byte size limit applies.
  anomaly_capture_bytesize: "10 KB"
  # Anomaly capture duration defines the logging buffer's duration limit (for example, the last 10 minutes of logs). Default is empty, meaning no duration limit applies.
  anomaly_capture_duration: 1m
  # Anomaly capture centered flushes logs AFTER anomaly generation; anomaly_capture_duration must also be set to use this feature. If set to true, logs within anomaly_capture_duration after the anomaly are flushed.
  anomaly_capture_centered: true
  # Anomaly coefficient is used to scale the final score into the [0, 100] range. The higher the coefficient, the higher the anomaly score will be. Default is 10. Can be set at rule level for some rule types.
  anomaly_coefficient: 10.0
  # capture_flush_mode sets the behavior of flushing captured contextual log buffers. Supported modes are listed below:
  # local_per_source: This is the default mode. The captured buffer of a source is flushed when a local alert is triggered from the same source.
  # local_all: All captured buffers are flushed when a local alert is triggered (not necessarily from the same source). In this mode, whenever an alert is triggered from the agent, the capture buffers of all active sources are flushed.
  # tag_per_source: The captured buffer of a source is flushed when there's an alert from the same source and tag (from any agent within the current tag).
  # tag_all: All captured buffers on all agents within the same tag are flushed whenever any of the agents triggers an alert.
  capture_flush_mode: tag_per_source
  # ephemeral indicates that this agent's corresponding tailers can be temporarily down (for example due to a scale-down scenario); it is used for agent-down scenarios.
  ephemeral: true
  archive:
    # archives all the logs to an s3 bucket with a proper date/time folder structure.
    name: s3
    aws_key_id: '{{ Env "TEST_AWS_KEY_ID" }}'
    aws_sec_key: "awssecret123"
    s3_bucket: "testbucket"
    s3_region: "us-east-2"
    size: 16MB
    compress: gzip
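  # Illustration (not part of the sample config; values below are hypothetical): the {{ Env ... }} template is
  # resolved from the agent's environment at config load time, and a second argument acts as a fallback default:
  #   aws_key_id: '{{ Env "TEST_AWS_KEY_ID" }}'             # -> value of $TEST_AWS_KEY_ID
  #   s3_region: '{{ Env "TEST_AWS_REGION" "us-east-2" }}'  # -> "us-east-2" when $TEST_AWS_REGION is unset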
  buffer:
    # setting the buffer's path will enable file buffering to reduce the agent's memory usage.
    path: "/var/log/edgedelta/archive"
  # persisting_cursor_settings defines persisting cursor locations, which is suitable for environments that don't want to miss any data during an agent restart.
  # path is the folder where we create our cursor file and flush_interval is the interval at which the in-memory state is saved to this file.
  persisting_cursor_settings:
    path: /var/edgedelta/pos
    file_name: cursor_file.json
    flush_interval: 1m
  # attributes defines user-defined key-value pairs that distinguish the running agent from others
  # these key/value pairs are attached to the data collected/generated by the agent and sent to streaming destinations
  # currently, the "environment", "app" and "region" keywords are supported
  attributes:
    environment: prod
    app: smp
    region: us-west
  # multiline_max_size defines the multiline buffer size in number of lines. Increase this maximum line count for overflow cases.
  # in overflow cases all buffered lines are dumped as a single line, so for such environments it is better to increase this value
  # integer type
  multiline_max_size: 250
  # multiline_max_bytesize defines the multiline buffer size in bytes. Increase this maximum byte limit for overflow cases.
  # in overflow cases all buffered lines are dumped as a single line, so for such environments it is better to increase this value
  # datasize.Size type
  multiline_max_bytesize: "10 KB"

imports:
  - name: "Test_Team"
    conf_id: "XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX"
    prefix: test_team
    params:
      ElasticIndexName: TEST_INDEX

# all inputs are defined in this section
# each one defines a comma separated list of labels
inputs:
  system_stats:
    labels: "system"
  container_stats:
    labels: "docker"
  agent_stats:
    labels: "agent"
  # Agent component health can be defined in only one workflow. The destinations which have the "health" feature turned on will receive agent health data as logs.
  agent_components_health:
    labels: "agent-components-health"
  # Agent heartbeat can be defined in only one workflow. The destinations which have the "heartbeat" feature turned on will receive agent heartbeat metrics.
  agent_heartbeat:
    labels: "agent-heartbeat"
  kubernetes_stats:
    labels: "kubernetes-stats"
  # to be able to use k8s_events, set the env var ED_LEADER_ELECTION_ENABLED="1" in the agent deployment manifest.
  k8s_events:
    labels: "k8s-events"
  process_stats:
    groups:
      - labels: "infa-processes"
        include:
          users:
            - infa
            - pcuser
          cmds:
            - org.apache.catalina.startup.(Bootstrap)
            - /tomcat/temp/([^/\s]+)
            - /home/pcuser/Informatica/9.1.0/server/bin/([^/\s]+)
            - (AdminConsole)
        exclude:
          users:
            - root
          cmds:
            - pmserver
  containers:
    - labels: "nginx,errorcheck"
      include:
        - "image=gitlab/nginx:latest"
    - labels: "billing,billing-ui,errorcheck"
      include:
        - "image=billing-dashboard:.*"
      enable_incoming_line_anomalies: true
    - labels: "errorcheck"
      include:
        - "image=.*"
      # Detects line patterns automatically based on the Ragel FSM based lexical recognition process. No need to specify line_pattern explicitly.
      auto_detect_line_pattern: true
    - labels: "apache-web"
      include:
        - "name=apache*,image=.*latest$"
      exclude:
        - "image=.*nginx.*"
        - "name=.*nginx.*"
      # If a "line_pattern" regex rule is specified in the agent config, the agent splits lines using this rule instead of the newline character ("\n").
      line_pattern: '^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}'
  kubernetes:
    - labels: "nginx,errorcheck"
      include:
        - "pod=^nginx.*$,kind=ReplicaSet,namespace=default,container-name=nginx,container-image=docker.io/nginx:latest,labels.my_app=abc"
      exclude: # exclude has higher priority
        - "namespace=^kube-system$"
    - labels: "apache,errorcheck"
      include:
        - "pod=^apache.*$,namespace=.*web*"
      exclude: # exclude has higher priority
        - "namespace=^kube-nginx$"
        - "pod=.*nginx*,kind=StatefulSet"
    - labels: "k8s with enrichment dynamic fields from labels"
      include:
        - "pod=flog,namespace=default"
      enrichments:
        dynamic:
          field_mappings:
            - field_name: "service"
              value: '{{".labels.service"}}'
  # On Azure AKS clusters, starting from around k8s version 1.19, the default log format was switched to text
  # because K8s switched from docker to containerd as the container runtime daemon.
  files:
    - labels: "billing,errorcheck"
      path: "/billing/logfolder1/*.log"
    - labels: "billing,errorcheck"
      path: "/etc/systemd/system/billingservice/*.log"
      # Detects line patterns automatically based on the Ragel FSM based lexical recognition process. No need to specify line_pattern explicitly.
      auto_detect_line_pattern: true
      # stack trace detector only runs in auto line detection mode
      boost_stacktrace_detection: true
      enable_persisting_cursor: true
      # source filters apply at the source pipe level.
      # All contextual logs and archive logs will be subject to these filters.
      # all workflow and rule pipes for this source will get the logs after these filters are applied.
      filters:
        - info
        - not_trace
        - mask_card
        - mask_password
    - labels: "docker,my_container"
      path: "/var/lib/docker/my_container/*.log"
      # If you collect the docker container standard output logs in a file with the "JSON File logging driver", you need to define and enable docker_mode.
      docker_mode: true
    - labels: "app,service_a"
      path: "/var/log/service_a.log"
      # If a "line_pattern" regex rule is specified in the agent config, the agent splits lines using this rule instead of the newline character ("\n").
      line_pattern: "^MMM dd, yyyy hh:mm:ss"
      # options for late log handling in terms of rule metrics (alerting and rule metric .error/.avg/.max/.min creation) and cluster patterns/samples:
      # ignore rule metrics/alerting for logs whose timestamp is older than 15m from now; report cluster patterns with the logs' original timestamp instead of time.Now()
      late_arrival_handling:
        rule_metrics:
          ignore_after: 15m
        patterns:
          ignore_after: 4h
          report_with_original_timestamp: true
      source_detection:
        source_type: "Docker"
        optional: false
        field_mappings:
          docker_container_id: "docker.id"
          docker_container_image: "docker.image"
    - labels: k8s_log
      path: /var/logs/anyDir/MyApp/users/MyPodID/transaction.log
      enrichments:
        # from_path is used to enrich data with fields extracted from the path.
        from_path:
          field_mappings:
            - field_name: application
              # pattern must be a capture pattern with only one capture group.
              pattern: /var/logs/anyDir/(?:(.+)/)?users/.*
        # from_k8s is used to enrich data with k8s attributes when the data is being streamed.
        # currently, "pod", "namespace" and "node" can be used.
        from_k8s:
          pod_identifier_pattern: /var/logs/anyDir/MyApp/users/(?:(.+)/)/.*
          field_mappings:
            - field_name: instance_id
              pod_attribute: pod
              transformers:
                # replace all "source" matches with "target"
                - source: "-"
                  target: "_"
                  type: "replace"
                # remove all "test" words
                - source: "test*"
                  target: ""
                  type: "regex"
            - field_name: namespace
              pod_attribute: namespace
            # fields from labels should have pod_attribute starting with "labels."
            - field_name: service
              pod_attribute: labels.service
        # dynamic fields are populated from other fields. value should be in text template format.
        # The fields used in the template must be defined in from_path or from_k8s.
        dynamic:
          field_mappings:
            - field_name: tag
              value: "tail.{{.application}}.{{.service}}"
            # dynamic fields can be derived from other dynamic fields.
            # Dependent fields should be ordered in dependency order. If field2 depends on field1, then field1 must be defined before field2.
            - field_name: version
              value: "v.0.1.13.{{.tag}}"
            # You can define static fields, and dynamic fields can be derived from static fields.
            - field_name: static_field
              value: "static_value"
            - field_name: derived_from_static_field
              value: "derived_from_static.{{.static_field}}"
        # from_logs is used to enrich data with fields extracted from logs
        from_logs:
          field_mappings:
            - field_name: podname
              # pattern must be a capture pattern with only one capture group.
              pattern: "podname: (\\w+)"
            - field_name: component
              # extracting using json_path is also supported
              json_path: fields.component
    # all the files in this glob path are processed, clustered, metric-populated and streamed individually.
    - labels: "billing,errorcheck"
      path: "/billing/logfolder1/*.log"
      separate_source: true # with this flag all files are handled individually.
    # add ingestion_time to JSON logs
    - labels: "billing,errorcheck"
      path: "/billing/logfolder1/*.log"
      # ingest timestamp if the input is in JSON format.
      add_ingestion_time: true
      skip_ingestion_time_on_failure: true # skip ingestion time when the input is broken or in an invalid format.
  winevents:
    - channel: "Application"
      labels: "errorcheck"
    - channel: "Security"
      labels: "errorcheck"
    - channel: "System"
      labels: "errorcheck"
    - channel: "Setup"
      labels: "errorcheck"
  ports:
    - protocol: tcp
      port: 514
      labels: "syslog,firewall"
      line_pattern: '^\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}'
    - protocol: tcp
      port: 24680
      labels: "errorcheck"
    - protocol: udp
      port: 13579
      labels: "errorcheck"
    - protocol: tcp
      port: 8080
      labels: "syslog,tls,service_a"
      tls:
        crt_file: /certs/server-cert.pem
        key_file: /certs/server-key.pem
        ca_file: /certs/ca.pem
  # exec inputs run at specified intervals
  # if the command fails (non-zero exit code) its output will be ignored
  # on successful executions each line of stdout is ingested into the system separately; see the illustration after this section.
  execs:
    - name: "processes"
      labels: "top"
      command: "top"
      interval: 3m
    - name: "welcomes"
      labels: "script"
      interval: 10s
      command: "/bin/sh -c"
      script: |
        for i in {1..50}
        do
          echo "Welcome $i times"
        done
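  # Illustration (not part of the sample config): assuming the shell expands {1..50}, each run of the
  # "welcomes" exec above prints 50 stdout lines, and each of them is ingested as its own log entry:
  #   Welcome 1 times
  #   Welcome 2 times
  #   ...
  #   Welcome 50 times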
  # ed specific http collector. agents know the protocol.
  # in a typical deployment a heavy agent tails the metrics from this http endpoint while node level agents push to it using the EDHeavy destination.
  # ed_ports inputs (tcp/http) are used for heavy agent or raw message tailing.
  # schema: Only FlattenedObservation is supported now. If schema is not provided then the input is assumed to be text lines.
  # read_size: Read size is only applicable when protocol=tcp; otherwise it is not used.
  # If read_size is not provided in that case then it will be assumed as 1.
  # read_timeout: Timeout duration for reading from the tcp port. It is only applicable for protocol=tcp.
  ed_ports:
    - labels: "error-counts-per-node"
      port: 4545
      protocol: http
      schema: FlattenedObservation
    - labels: "errorcheck"
      port: 9000
      protocol: tcp
      read_size: 10000
      read_timeout: 30s
      source_detection:
        source_type: "K8s" # Possible source types: "Docker", "ECS", "File" and "K8s"
        # When optional is set to "true", if an acquired log is not processed properly, the log will not be ignored (the default behavior) and will be ingested with the original
        # (in this case, port) source definition.
        optional: true
        # Field mappings define how we can extract certain information from the obtained input.
        # The key is the label that we decorate the log with, with respect to the given source type. We will match the given key with our internal
        # representation for this decoration purpose. The value corresponds to the JSON path where the field related to the key can be found.
        # Possible mappings for each source type:
        # Docker:
        #   - docker_container_id, docker_container_image (Mandatory)
        #   - docker_container_name, docker_image_name (Optional) (Not defining them can cause loss of information)
        # ECS:
        #   - ecs_container_id, ecs_container_image, ecs_container_name (Mandatory)
        #   - ecs_cluster, ecs_container, ecs_task_family, ecs_task_version (Optional) (Not defining them can cause loss of information)
        # File:
        #   - file_path, file_glob_path (Mandatory)
        # K8s:
        #   - k8s_namespace, k8s_pod_name, k8s_container_name, k8s_container_image (Mandatory)
        #   - k8s_logfile_path, k8s_controller_kind, k8s_controller_name, k8s_controller_logical_name, k8s_pod_id, k8s_docker_id (Optional) (Not defining them can cause loss of information)
        field_mappings:
          k8s_namespace: "kubernetes.namespace"
          k8s_pod_name: "kubernetes.pod.name"
          k8s_container_name: "kubernetes.container.name"
          k8s_container_image: "kubernetes.container.image"
    - labels: "ed-port-with-auto-detect-line-pattern"
      port: 5656
      protocol: tcp
      # Detects line patterns automatically based on the Ragel FSM based lexical recognition process. No need to specify line_pattern explicitly.
      auto_detect_line_pattern: true
    - labels: "ed-port-with-given-line-pattern"
      port: 9091
      protocol: http
      # If a "line_pattern" regex rule is specified in the agent config, the agent splits lines using this rule instead of the newline character ("\n").
      line_pattern: '^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}'
      # options for late log handling in terms of rule metrics (alerting and rule metric .error/.avg/.max/.min creation) and cluster patterns/samples:
      # ignore rule metrics/alerting for logs whose timestamp is older than 10m from now; report cluster patterns with the logs' original timestamp instead of time.Now()
      late_arrival_handling:
        rule_metrics:
          ignore_after: 10m
        patterns:
          ignore_after: 3h
          report_with_original_timestamp: true
    - labels: "ed-port-with-network-interface"
      port: 4545
      protocol: tcp
      # Listen is the network interface to bind to; the default value is "0.0.0.0".
      listen: 127.0.0.1
    - labels: "ed-port-tcp-with-tls"
      port: 4545
      protocol: tcp
      tls:
        crt_file: /certs/server-cert.pem
        key_file: /certs/server-key.pem
        ca_file: /certs/ca.pem
    - labels: "ed-port-https-with-tls"
      protocol: https
      listen: localhost
      port: 443
      tls:
        crt_file: /certs/server-cert.pem
        key_file: /certs/server-key.pem
        ca_file: /certs/ca.pem
    - labels: "ed-port-with-add-ingestion-time"
      port: 4545
      protocol: tcp
      # ingest timestamp if the input is in JSON format.
      add_ingestion_time: true
      skip_ingestion_time_on_failure: true # skip ingestion time when the input is broken or in an invalid format.
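  # Illustration (not part of the sample config; the enriched field name shown is an assumption): with
  # add_ingestion_time enabled, a JSON line such as {"msg": "payment accepted"} is enriched with the time the
  # agent ingested it, e.g. {"msg": "payment accepted", "ingestion_time": "2021-05-14T21:00:00Z"}; with
  # skip_ingestion_time_on_failure, a non-JSON line is passed through unchanged instead of failing.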
  # demo inputs generate fake data at the given intervals
  # a 1ms speed generates roughly 1000 log lines per second
  # a 100ms error interval kicks in a burst of errors with the given error count
  demos:
    - name: "fast_demo"
      labels: "fast"
      speed: "1ms"
      error_interval: "100ms"
      error_count: 20
    - name: "slow_demo"
      labels: "slow"
      speed: "2h"
      error_interval: "2m"
      error_count: 30
  ecs:
    - labels: "errorcheck"
      include:
        - "container-name=myecho,task-family=test-task,task-version=2"
      exclude:
        - "container-name=.*xray.*"
  eventhubs:
    - labels: "errorcheck"
      # Eventhub connection string can be retrieved from the Azure portal: https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-get-connection-string
      connection_string: "Endpoint=sb://edgedelta.servicebus.windows.net/;EntityPath=insights;SharedAccessKeyName=***;SharedAccessKey==***"
      # Eventhub consumer group.
      consumer_group: "$Default"
      # If partition ids are provided then only those partitions are consumed.
      # Otherwise the Event Processor Host approach is used, which load-balances partitions between consumers.
      # Checkpoint directory is used to persist last read sequence numbers.
      partition_ids: "0,1,2,3"
      checkpoint_dir: "/var/eventhub-checkpoint/"
      # Storage account is used only when partition_ids is empty. In this mode azure storage is used as the lease & checkpoint backend.
      # Multiple agents communicate via blob objects to load-balance the eventhub partitions evenly and keep track of the last retrieved event offset.
      # storage_account_name: "mystorageacc"
      # storage_account_key: "*****"
      # storage_container_name: "edgedelta-eventhub-container-for-test"
  kafkas:
    - labels: "errorcheck"
      # Kafka endpoint is a comma separated list of broker urls
      endpoint: "something"
      # Kafka topic to listen to.
      topic: "topic"
    - labels: "errorcheck-tls"
      endpoint: "something-tls"
      topic: "topic"
      tls:
        disable_verify: true
        ca_file: /var/etc/kafka/ca_file
        ca_path: /var/etc/kafka
        crt_file: /var/etc/kafka/crt_file
        key_file: /var/etc/kafka/keyfile
        key_password: p@ssword123
        client_auth_type: noclientcert # possible selections: noclientcert, requestclientcert, requireanyclientcert, verifyclientcertifgiven, requireandverifyclientcert
        min_version: TLSv1_1
        max_version: TLSv1_3
    - labels: "my-kafka-events"
      endpoint: "something"
      topic: "topic"
      # Consumer group to isolate the consumption of the topic for the agents. All agents sharing the same config join the same consumer group.
      group_id: "my-group"
      sasl:
        username: kafka_username
        password: p@ssword123
        mechanism: PLAIN # possible selections: PLAIN, SCRAM-SHA-256, SCRAM-SHA-512
      source_detection:
        source_type: "Custom"
        optional: false
        field_mappings:
          namespace: "kubernetes.namespace"
          serviceName: "service"
          roleName: "user.role"
          systemType: "system"
  s3_sqs:
    - labels: "errorcheck"
      sqs_url: "https://sqs.us-west-2.amazonaws.com/233765244907/taylan-test-sqs"
      # access credentials must have permissions for the following actions:
      # sqs:DeleteMessage, s3:GetObject, sqs:DeleteMessageBatch, sqs:ReceiveMessage
      access_key_id: "ABCDEFG"
      access_secret: "Dn2djaskl"
      # region where the bucket and sqs queue are located
      region: "us-west-2"
    - labels: "alb"
      sqs_url: "https://sqs.us-west-2.amazonaws.com/233765244907/my-alb-logs-sqs"
      access_key_id: "ABCDEFG"
      access_secret: "Dn2djaskl"
      region: "us-west-2"
      # supported log types for s3:
      # - alb: Application load balancer logs are gzip files. Each line contains a raw access log message.
      # - cloudtrail: Cloud trail logs are gzip files. Each file contains a json object which has multiple records inside.
      log_type: alb
    - labels: "sqs-input-assumes-role"
      sqs_url: "https://sqs.us-west-2.amazonaws.com/233765244907/taylan-test-sqs"
      region: "us-west-2"
      # role_arn is used for assuming an iam role. To see how it works ref: https://docs.aws.amazon.com/IAM/latest/UserGuide/tutorial_cross-account-with-roles.html
      role_arn: "arn:aws:iam:::role/"
      # external_id increases the security of the role by requiring an optional external identifier, which prevents "confused deputy" attacks.
      external_id: "053cf606-8e80-47bf-b849-8cd1cc826cfc"
  cloudwatches:
    - labels: "us-west-2_ed-log-group_admin-api"
      # region supports regex; this matches all regions in the us.
      region: "^us.*$"
      # you should provide the log group name literally; it does not support regex.
      log_group: /ed-log-group
      # log_stream supports regex; if it is not provided, all log streams are fetched.
      log_stream: "^log.*$"
      # lookback defines how far back to start monitoring log events. Default is 1 hour.
      lookback: 1h
      # interval is the polling frequency for new incoming log events. Default is 1 minute.
      interval: 1m
      # prepend_timestamp adds the event timestamp as a prefix of the event message with a tab ("\t") delimiter.
      prepend_timestamp: true
      # The maximum number of log events returned.
      # By default the maximum is as many log events as can fit in a response size of 1 MB, up to 10,000 log events.
      result_limit: 5000
    - labels: "us_ed-log-group_admin"
      region: "^us.*$"
      log_group: "/ed-log-group"
      log_stream: "^admin.*$"
      interval: 5m
    - labels: "ed-log-group"
      # all regions.
      region: ".*"
      log_group: "/ed-log-group"
      # all streams.
      log_stream: ".*"
      interval: 5m
    - labels: "ed-log-with-regex-group-name"
      # all regions.
      region: ".*"
      # log groups starting with /ed-log
      log_group: "^/ed-log"
      # all streams.
      log_stream: ".*"
      interval: 5m
    - labels: "cloudwatch-input-assumes-role"
      region: "us-west-2"
      log_group: "/ed-log-group"
      # all streams.
      log_stream: ".*"
      interval: 5m
      # role_arn is used for assuming an iam role. To see how it works ref: https://docs.aws.amazon.com/IAM/latest/UserGuide/tutorial_cross-account-with-roles.html
      role_arn: "arn:aws:iam:::role/"
      # external_id increases the security of the role by requiring an optional external identifier, which prevents "confused deputy" attacks.
      external_id: "053cf606-8e80-47bf-b849-8cd1cc826cfc"
    - labels: "cloudwatch-input-host-filtered"
      # If host is set then cloudwatch will be tailed by the agent on the given host. This can be useful to avoid duplicate consumption if you have more than one agent running this config.
      host: "myhost"
      # all regions.
      region: ".*"
      # log groups starting with /ed-log
      log_group: "^/ed-log"
      # all streams.
      log_stream: ".*"
      interval: 5m
  pubsubs:
    - labels: "google_pubsub_with_key"
      project_id: "projectID1"
      sub_id: "subID1"
      key: "{\"pubsub_key\": \"key_123\"}"
    - labels: "google_pubsub_with_key_path"
      project_id: "projectID2"
      sub_id: "subID2"
      key_path: "pubsub_key_path"
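  # Note (illustration, not part of the sample config): the comma separated labels declared on the inputs above
  # are how workflows refer to inputs; for example, an input with labels: "billing,errorcheck" can be selected
  # by a workflow via either the "billing" or the "errorcheck" label.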
# all outputs are defined in this section.
# each one must have a unique name within the outputs section.
outputs:
  streams:
    # statistics calculated by edgedelta agents can be sent to streaming endpoints
    - name: '{{ Env "TEST_SUMO" "sumo-us" }}'
      type: sumologic
      endpoint: "https://endpoint4.collection.us2.sumologic.com/receiver/v1/http/XYZ"
      custom_tags:
        "app": "transaction_manager"
        "region": "us-west-2"
        "File Path": "{{.FileGlobPath}}"
        "K8s PodName": "{{.K8sPodName}}"
        "K8s Namespace": "{{.K8sNamespace}}"
        "K8s ControllerKind": "{{.K8sControllerKind}}"
        "K8s ContainerName": "{{.K8sContainerName}}"
        "K8s ContainerImage": "{{.K8sContainerImage}}"
        "K8s ControllerLogicalName": "{{.K8sControllerLogicalName}}"
        "ECSCluster": "{{.ECSCluster}}"
        "ECSContainerName": "{{.ECSContainerName}}"
        "ECSTaskVersion": "{{.ECSTaskVersion}}"
        "ECSTaskFamily": "{{.ECSTaskFamily}}"
        "DockerContainerName": "{{.DockerContainerName}}"
        "ConfigID": "{{.ConfigID}}"
        "Host": "{{.Host}}"
        "Source": "{{.Source}}"
        "SourceType": "{{.SourceType}}"
        "Tag": "{{.Tag}}"
    - name: sumo-us-2
      type: sumologic
      endpoint: '{{ Env "EMPTY" "https://endpoint4.collection.us2.sumologic.com/receiver/v1/http/XYZ" }}'
    - name: datadog-default
      type: datadog
      api_key: '{{ Env "TEST_DD_APIKEY" }}'
      custom_tags:
        "app": "transaction_manager"
        "region": "us-west-2"
        "File Path": "{{.FileGlobPath}}"
        "K8s PodName": "{{.K8sPodName}}"
        "K8s Namespace": "{{.K8sNamespace}}"
        "K8s ControllerKind": "{{.K8sControllerKind}}"
        "K8s ContainerName": "{{.K8sContainerName}}"
        "K8s ContainerImage": "{{.K8sContainerImage}}"
        "K8s ControllerLogicalName": "{{.K8sControllerLogicalName}}"
        "ECSCluster": "{{.ECSCluster}}"
        "ECSContainerName": "{{.ECSContainerName}}"
        "ECSTaskVersion": "{{.ECSTaskVersion}}"
        "ECSTaskFamily": "{{.ECSTaskFamily}}"
        "DockerContainerName": "{{.DockerContainerName}}"
        "ConfigID": "{{.ConfigID}}"
        "Host": "{{.Host}}"
        "Source": "{{.Source}}"
        "SourceType": "{{.SourceType}}"
        "Tag": "{{.Tag}}"
    - name: datadog-custom
      type: datadog
      # If provided, custom installation of datadog log host can be reached.
      log_host: ""
      # If provided, custom installation of datadog metric host can be reached.
      metric_host: ""
      # If provided, custom installation of datadog event host can be reached.
      event_host: ""
      api_key: '{{ Env "TEST_DD_APIKEY" }}'
      features: metric
      custom_tags:
        "app": "starbucks_pos_transaction_manager"
        "region": "us-west-2"
    - name: datadog-alert-as-log
      type: datadog
      api_key: '{{ Env "TEST_DD_APIKEY" }}'
      features: metric, alert, edac
      alert_as_log: true # this indicates the alert will be sent as a log instead of an event by default
    - name: newrelic
      type: newrelic
      log_host: ""
      metric_host: ""
      event_host: ""
      api_key: '{{ Env "TEST_NR_APIKEY" }}'
      account_name: '{{ Env "TEST_NR_ACCOUNT_ID" }}'
      features: metric,alert
      alert_as_log: true
      alert_as_event: true
      custom_tags:
        "app": "starbucks_pos_transaction_manager"
        "region": "us-west-2"
        "File Path": "{{.FileGlobPath}}"
        "K8s PodName": "{{.K8sPodName}}"
        "K8s Namespace": "{{.K8sNamespace}}"
        "K8s ControllerKind": "{{.K8sControllerKind}}"
        "K8s ContainerName": "{{.K8sContainerName}}"
        "K8s ContainerImage": "{{.K8sContainerImage}}"
        "K8s ControllerLogicalName": "{{.K8sControllerLogicalName}}"
        "ECSCluster": "{{.ECSCluster}}"
        "ECSContainerName": "{{.ECSContainerName}}"
        "ECSTaskVersion": "{{.ECSTaskVersion}}"
        "ECSTaskFamily": "{{.ECSTaskFamily}}"
        "DockerContainerName": "{{.DockerContainerName}}"
        "ConfigID": "{{.ConfigID}}"
        "Host": "{{.Host}}"
        "Source": "{{.Source}}"
        "SourceType": "{{.SourceType}}"
        "Tag": "{{.Tag}}"
    - name: honeycomb
      type: honeycomb
      host: "" # Host is optional; default is "api.honeycomb.io"
      api_key: '{{ Env "TEST_HC_APIKEY" }}'
      dataset_name: ""
      unpacking: false
      features: metric,log,edac
      custom_tags:
        "app": "starbucks_pos_transaction_manager"
        "region": "us-west-2"
        "File Path": "{{.FileGlobPath}}"
        "K8s PodName": "{{.K8sPodName}}"
        "K8s Namespace": "{{.K8sNamespace}}"
        "K8s ControllerKind": "{{.K8sControllerKind}}"
        "K8s ContainerName": "{{.K8sContainerName}}"
        "K8s ContainerImage": "{{.K8sContainerImage}}"
        "K8s ControllerLogicalName": "{{.K8sControllerLogicalName}}"
        "ECSCluster": "{{.ECSCluster}}"
        "ECSContainerName": "{{.ECSContainerName}}"
        "ECSTaskVersion": "{{.ECSTaskVersion}}"
        "ECSTaskFamily": "{{.ECSTaskFamily}}"
        "DockerContainerName": "{{.DockerContainerName}}"
        "ConfigID": "{{.ConfigID}}"
        "Host": "{{.Host}}"
        "Source": "{{.Source}}"
        "SourceType": "{{.SourceType}}"
        "Tag": "{{.Tag}}"
    - name: appdynamics
      type: appdynamics
      host: "" # Host is optional; default is "analytics.appdynamics.com"
      global_account_name: "global123"
      api_key: "12345"
      schema_name: ""
      features: metric,log,edac
      custom_tags:
        "app": "starbucks_pos_transaction_manager"
        "region": "us-west-2"
        "File Path": "{{.FileGlobPath}}"
        "K8s PodName": "{{.K8sPodName}}"
        "K8s Namespace": "{{.K8sNamespace}}"
        "K8s ControllerKind": "{{.K8sControllerKind}}"
        "K8s ContainerName": "{{.K8sContainerName}}"
        "K8s ContainerImage": "{{.K8sContainerImage}}"
        "K8s ControllerLogicalName": "{{.K8sControllerLogicalName}}"
        "ECSCluster": "{{.ECSCluster}}"
        "ECSContainerName": "{{.ECSContainerName}}"
        "ECSTaskVersion": "{{.ECSTaskVersion}}"
        "ECSTaskFamily": "{{.ECSTaskFamily}}"
        "DockerContainerName": "{{.DockerContainerName}}"
        "ConfigID": "{{.ConfigID}}"
        "Host": "{{.Host}}"
        "Source": "{{.Source}}"
        "SourceType": "{{.SourceType}}"
        "Tag": "{{.Tag}}"
    - name: elastic-cloud
      type: elastic
      index: "index name"
      cloud_id: ""
      token: ""
      # features is a comma separated string that contains the list of datasets which are enabled for this streaming destination.
      # all streaming destinations support the features field, but not all of them support the full list of datasets, e.g. some destinations only support metrics.
      # possible dataset types: metric (all kind of metrics), cluster (patterns and samples), context (contextual logs), log (forwarding all logs)
      # if features is unset, the default will be "metric,cluster,context"
      features: metric,cluster,context
    - name: elastic-local
      type: elastic
      index: "index name"
      user: elastic
      password: '{{ Env "TEST_ELASTIC_PWD" }}'
      address:
        - elasticnode1
    - name: elastic-send-as-is
      type: elastic
      index: "index name"
      user: elastic
      password: '{{ Env "TEST_ELASTIC_PWD" }}'
      address:
        - elasticnode1
      features: edac
      # send_as_is indicates the string typed data (edac, log) should be sent as is without wrapping it in a json object.
      send_as_is: true
      # edac_enrichment defines which edac related fields should be added to the original JSON payload just before pushing.
      # This only applies to elastic streams and when send_as_is=true.
      edac_enrichment:
        edac_id_field: "edac_id" # final JSON will have an additional "edac_id" field whose value will be the EDAC Id
        metric_name_field: "name" # final JSON will have an additional "name" field (or it will be overridden if name exists) whose value is the metric name
    - name: elastic-opensearch-with-rolearn
      type: elastic
      index: "index name"
      # region is used for AWS ElasticSearch (OpenSearch)
      region: "us-west-2"
      # role_arn is used if assuming an aws iam role.
      role_arn: "arn:aws:iam:::role/"
      # external_id is optional, it is a unique identifier to avoid confused deputy attacks.
      # ref: https://docs.aws.amazon.com/IAM/latest/UserGuide/confused-deputy.html
      external_id: "external_id"
      address:
        - opensearch_domain_endpoint
      custom_tags:
        "app": "test"
        "region": "us-west-2"
        "File Path": "{{.FileGlobPath}}"
        "K8s PodName": "{{.K8sPodName}}"
        "K8s Namespace": "{{.K8sNamespace}}"
        "K8s ControllerKind": "{{.K8sControllerKind}}"
        "K8s ContainerName": "{{.K8sContainerName}}"
        "K8s ContainerImage": "{{.K8sContainerImage}}"
        "K8s ControllerLogicalName": "{{.K8sControllerLogicalName}}"
        "ECSCluster": "{{.ECSCluster}}"
        "ECSContainerName": "{{.ECSContainerName}}"
        "ECSTaskVersion": "{{.ECSTaskVersion}}"
        "ECSTaskFamily": "{{.ECSTaskFamily}}"
        "DockerContainerName": "{{.DockerContainerName}}"
        "ConfigID": "{{.ConfigID}}"
        "Host": "{{.Host}}"
        "Source": "{{.Source}}"
        "SourceType": "{{.SourceType}}"
        "Tag": "{{.Tag}}"
    - name: elastic-opensearch
      type: elastic
      index: "index name"
      region: "us-west-2"
      address:
        - opensearch_domain_endpoint
    - name: edport-with-endpoint
      type: edport
      endpoint: "http://localhost:4545/v0/collect"
    - name: edport-with-endpoint-and-schema
      type: edport
      endpoint: "http://localhost:4545/v0/collect"
      schema: FlattenedObservation
    - name: edport-with-tcp
      type: edport
      host: "log-repo-host"
      port: 23131
      pool_size: 1
    - name: edport-with-tcp-and-schema
      type: edport
      host: "log-repo-host"
      port: 23131
      schema: FlattenedObservation
    - name: cloudwatch
      type: cloudwatch
      region: us-west-2
      log_group_name: test_log_group
      log_stream_name: test_log_stream
      # a monitored container can override the default values of log group name, log stream name and log stream prefix by setting the ed_log_group_name, ed_log_stream_name, ed_log_stream_prefix labels
      # Default value is false.
      allow_label_override: true
      # Default value is false.
      auto_create: true
    - name: cloudwatch-prefix
      type: cloudwatch
      region: us-west-2
      log_group_name: test_log_group
      # CloudWatch log stream prefix (either name or prefix is supported, not both)
      log_stream_prefix: ecs
      # a monitored container can override the default values of log group name, log stream name and log stream prefix by setting the ed_log_group_name, ed_log_stream_name, ed_log_stream_prefix labels
      features: log
    - name: cloudwatch-auto
      type: cloudwatch
      region: us-west-2
      # only supported for ECS environments, and when enabled only the region configuration can be provided. Automatically creates:
      # LogGroupName in the format of /ecs/task_definition_family
      # LogStreamPrefix in the format of ecs/container_name/task_id
      # Default value is false.
      auto_configure: true
      auto_create: true
      features: log
    - name: cloudwatch-assumes-role
      type: cloudwatch
      region: us-west-2
      log_group_name: test_log_group
      log_stream_name: test_log_stream
      # role_arn is used for assuming an iam role. To see how it works ref: https://docs.aws.amazon.com/IAM/latest/UserGuide/tutorial_cross-account-with-roles.html
      role_arn: "arn:aws:iam:::role/"
      # external_id increases the security of the role by requiring an optional external identifier, which prevents "confused deputy" attacks.
      external_id: "053cf606-8e80-47bf-b849-8cd1cc826cfc"
    - name: app-insights
      type: azure
      # the application insights connection string can be grabbed from the azure portal; it contains both the endpoint and the instrumentation key
      endpoint: https://dc.services.visualstudio.com/v2/track
      # api_key corresponds to app insights' instrumentation key
      api_key: '{{ Env "TEST_AZURE_APIKEY" }}'
    - name: signalfx
      type: signalfx
      endpoint: https://ingest.us1.signalfx.com/v2
      token: '{{ Env "TEST_SIGNALFX_TOKEN" }}'
      features: metric
    - name: humio
      type: humio
      endpoint: http://localhost:8080
      token: '{{ Env "TEST_HUMIO_TOKEN" }}'
      features: log
    - name: loggly
      type: loggly
      endpoint: https://logs-01.loggly.com
      token: token12345
      features: log
      grouped_events: true # enables the grouped event feature for loggly, meaning that one payload per observation group is generated. The "events" field is used instead of "event".
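    # Illustration (not part of the sample config): features is a comma separated list of datasets to send to a
    # destination, so a destination that should receive only forwarded raw logs and alerts could set, e.g.:
    #   features: log,alert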
    - name: graylog
      type: graylog
      host: "localhost"
      port: 5555
      features: metric,log,health,alert,event
      custom_tags:
        "ConfigID": "{{.ConfigID}}"
        "Host": "{{.Host}}"
        "Source": "{{.Source}}"
        "SourceType": "{{.SourceType}}"
        "Tag": "{{.Tag}}"
    - name: kafka
      type: kafka
      endpoint: localhost:2888,localhost:3888 # brokers
      topic: example_kafka_topic
      required_acks: 10
      batch_size: 1000
      batch_bytes: 10000
      batch_timeout: 1m
      async: true
      features: log,metric
      tls:
        disable_verify: true
        ca_file: /var/etc/kafka/ca_file
        ca_path: /var/etc/kafka
        crt_file: /var/etc/kafka/crt_file
        key_file: /var/etc/kafka/keyfile
        key_password: p@ssword123
        client_auth_type: noclientcert # possible selections: noclientcert, requestclientcert, requireanyclientcert, verifyclientcertifgiven, requireandverifyclientcert
      sasl:
        username: kafka_username
        password: p@ssword123
        mechanism: PLAIN # possible selections: PLAIN, SCRAM-SHA-256, SCRAM-SHA-512
    - name: influxdb-integration
      type: influxdb
      endpoint: "https://influxdb..com/"
      token: YOUR_API_TOKEN
      # empty version or version 2.x requires bucket and organization info
      bucket: testbucket
      organization: yourorganization
      port: 443
    - name: influxdb-integration-v1x
      type: influxdb
      version: 1.x
      endpoint: "https://influxdb..com/"
      token: YOUR_API_TOKEN
      port: 443
      http_user: admin
      http_password: your_http_password
      # version 1.x requires db info
      db: "specific_influxdb_database"
    - name: scalyr-integration
      type: scalyr
      endpoint: "https://app.scalyr.com/api/uploadLogs?token={scalyr log access write key}"
    # integrations can be referred to in the config via integration_name.
    # if no name override is provided then the integration_name value should be used in the workflow destinations.
    # the rest of the fields are overridable
    - integration_name: orgs-splunk
      name: edac-splunk-dest
      features: edac
      index: edac-index
    - integration_name: orgs-splunk
      name: metric-splunk-dest
      features: metric
      index: metric-index
    - name: my-splunk
      type: splunk
      endpoint: "://:/"
      token: "32-character GUID token"
    - name: wavefront-integration
      type: wavefront
      endpoint: "https://{your wavefront domain}.wavefront.com/report"
      token: ""
    - name: logzio
      type: logzio
      endpoint: "https://app-eu.logz.io:8071"
      token: ""
      metric_token: ""
      custom_tags:
        "app": "starbucks_pos_transaction_manager"
        "region": "us-west-2"
        "File Path": "{{.FileGlobPath}}"
        "K8s PodName": "{{.K8sPodName}}"
        "K8s Namespace": "{{.K8sNamespace}}"
        "K8s ControllerKind": "{{.K8sControllerKind}}"
        "K8s ContainerName": "{{.K8sContainerName}}"
        "K8s ContainerImage": "{{.K8sContainerImage}}"
        "K8s ControllerLogicalName": "{{.K8sControllerLogicalName}}"
        "ECSCluster": "{{.ECSCluster}}"
        "ECSContainerName": "{{.ECSContainerName}}"
        "ECSTaskVersion": "{{.ECSTaskVersion}}"
        "ECSTaskFamily": "{{.ECSTaskFamily}}"
        "DockerContainerName": "{{.DockerContainerName}}"
        "ConfigID": "{{.ConfigID}}"
        "Host": "{{.Host}}"
        "Source": "{{.Source}}"
        "SourceType": "{{.SourceType}}"
        "Tag": "{{.Tag}}"
    - name: fluentd-log-fwd
      type: fluentd
      host: log-repo-host
      port: 23131
      encoder: msgpack
      pool_size: 10
      # tag_prefix; the agent settings tag value is appended to this prefix
      # and used as the fluentd forward tag (the payload itself will still have edgedelta_tag=agentsettings.tag)
      # tag_prefix is only used as the fluentd tag if the corresponding data doesn't have a tag defined in enrichments
      tag_prefix: "tail.ed."
      features: log
    - name: loki-integration
      type: loki
      endpoint: "https://localhost:3000/loki/api/v1/push"
      api_key: "api_key"
      user: "user"
      custom_tags:
        "app": "test"
        "region": "us-west-2"
        "File Path": "{{.FileGlobPath}}"
        "K8s PodName": "{{.K8sPodName}}"
        "K8s Namespace": "{{.K8sNamespace}}"
        "K8s ControllerKind": "{{.K8sControllerKind}}"
        "K8s ContainerName": "{{.K8sContainerName}}"
        "K8s ContainerImage": "{{.K8sContainerImage}}"
        "K8s ControllerLogicalName": "{{.K8sControllerLogicalName}}"
        "ECSCluster": "{{.ECSCluster}}"
        "ECSContainerName": "{{.ECSContainerName}}"
        "ECSTaskVersion": "{{.ECSTaskVersion}}"
        "ECSTaskFamily": "{{.ECSTaskFamily}}"
        "DockerContainerName": "{{.DockerContainerName}}"
        "ConfigID": "{{.ConfigID}}"
        "Host": "{{.Host}}"
        "Source": "{{.Source}}"
        "SourceType": "{{.SourceType}}"
        "Tag": "{{.Tag}}"
      message_template:
        "File Path": "{{.FileGlobPath}}"
        "K8s PodName": "{{.K8sPodName}}"
        "K8s Namespace": "{{.K8sNamespace}}"
        "K8s ControllerKind": "{{.K8sControllerKind}}"
        "K8s ContainerName": "{{.K8sContainerName}}"
        "K8s ContainerImage": "{{.K8sContainerImage}}"
        "K8s ControllerLogicalName": "{{.K8sControllerLogicalName}}"
        "ECSCluster": "{{.ECSCluster}}"
        "ECSContainerName": "{{.ECSContainerName}}"
        "ECSTaskVersion": "{{.ECSTaskVersion}}"
        "ECSTaskFamily": "{{.ECSTaskFamily}}"
        "DockerContainerName": "{{.DockerContainerName}}"
        "ConfigID": "{{.ConfigID}}"
        "Host": "{{.Host}}"
        "Source": "{{.Source}}"
        "SourceType": "{{.SourceType}}"
        "Tag": "{{.Tag}}"
    - name: loki-send-alert-as-log
      type: loki
      endpoint: "https://localhost:3000/loki/api/v1/push"
      api_key: "api_key"
      user: "user"
      custom_tags:
        "app": "test"
        "region": "us-west-2"
      message_template:
        "SourceType": "{{.SourceType}}"
        "Tag": "{{.Tag}}"
      alert_as_log: true # this indicates the alert will be sent as a log instead of an event by default
    - name: dynatrace
      type: dynatrace
      log_endpoint: "https://{your-environment-id}.live.dynatrace.com/api/v2/logs/ingest"
      metric_endpoint: "https://{your-environment-id}.live.dynatrace.com/api/v2/metrics/ingest"
      token: ""
      custom_tags:
        "region": "us-west-2"
        "File Path": "{{.FileGlobPath}}"
        "K8s PodName": "{{.K8sPodName}}"
        "K8s Namespace": "{{.K8sNamespace}}"
        "K8s ControllerKind": "{{.K8sControllerKind}}"
        "K8s ContainerName": "{{.K8sContainerName}}"
        "K8s ContainerImage": "{{.K8sContainerImage}}"
        "K8s ControllerLogicalName": "{{.K8sControllerLogicalName}}"
        "ECSCluster": "{{.ECSCluster}}"
        "ECSContainerName": "{{.ECSContainerName}}"
        "ECSTaskVersion": "{{.ECSTaskVersion}}"
        "ECSTaskFamily": "{{.ECSTaskFamily}}"
        "DockerContainerName": "{{.DockerContainerName}}"
        "ConfigID": "{{.ConfigID}}"
        "Host": "{{.Host}}"
        "Source": "{{.Source}}"
        "SourceType": "{{.SourceType}}"
        "Tag": "{{.Tag}}"
    - name: eventhub-stream
      type: eventhubstream
      endpoint: "https://namespace.servicebus.windows.net/hub/messages"
      token: "azure-ad-token"
    - name: generic-edport-stream-https
      type: edport
      endpoint: https://test.com/catcher
      schema: FlattenedObservation
      pool_size: 5
      features: log
    - name: generic-edport-stream-tcp
      type: edport
      host: log-repo-host
      port: 23131
      pool_size: 5
      features: log,edac,metric
    - name: cribl-http
      type: cribl
      endpoint: http://in.logstream..cribl.cloud:10080/crible/_bulk
      token: ""
      features: log,edac,metric,alert
    - name: my-openmetrics
      type: openmetrics
      endpoint: "http://localhost:8428/metrics"
      features: metric
      custom_tags:
        "app": "test"
        "region": "us-west-2"
        "File Path": "{{.FileGlobPath}}"
        "K8s PodName": "{{.K8sPodName}}"
        "K8s Namespace": "{{.K8sNamespace}}"
        "K8s ControllerKind": "{{.K8sControllerKind}}"
        "K8s ContainerName": "{{.K8sContainerName}}"
"K8s ContainerImage": "{{.K8sContainerImage}}" "K8s ControllerLogicalName": "{{.K8sControllerLogicalName}}" "ECSCluster": "{{.ECSCluster}}" "ECSContainerName": "{{.ECSContainerName}}" "ECSTaskVersion": "{{.ECSTaskVersion}}" "ECSTaskFamily": "{{.ECSTaskFamily}}" "DockerContainerName": "{{.DockerContainerName}}" "ConfigID": "{{.ConfigID}}" "Host": "{{.Host}}" "Source": "{{.Source}}" "SourceType": "{{.SourceType}}" "Tag": "{{.Tag}}" - name: my-s3-streamer type: s3stream aws_key_id: '{{ Env "AWS_KEY_ID" }}' aws_sec_key: '{{ Env "AWS_SECRET_KEY" }}' bucket: testbucket region: us-east-2 flush_interval: 30s # Default is 3 minutes flush_bytesize: 1M # Having a byte size parameter means that if the given buffer reachs given threshold, it will be flushed even if flush interval is not yet elapsed - name: my-observeinc type: observeinc endpoint: "http://localhost:5555" features: metric,log,health,alert,event custom_tags: "Host": "{{.Host}}" "Source": "{{.Source}}" "SourceType": "{{.SourceType}}" "Tag": "{{.Tag}}" triggers: # anomaly captures can be sent to trigger endpoints such as slack. - name: error-anomaly-slack type: slack endpoint: "" # suppression_window can be configured to avoid duplicate alerts. It defaults to 20 minutes when unset. suppression_window: 60m # supported suppression modes: 'local' and 'global'. default mode is local. # local would suppress if there was an alert fired for same source+rule from this agent # global would suppress if there was an alert fired for source+rule from any agent that shares same tag/configid suppression_mode: global # notify_content is optional way to customize the notification content. It supports templating. # available template fields: EDAC, Source, SourceType, MetricName, Tag, Host, ConfigID, Timestamp... # important notes about templates you should read before use: # - if the value is empty the item will not be sent to slack # - the keys are sorted alphabetically before sending to slack so they will not appear in the order specified in the config notify_content: title: "Anomaly Detected: {{.ProcessorDescription}}" # disable_default_fields is used for disabling default fields in notify message. disable_default_fields: false # advanced_content provides full flexibility to defined the payload in slack notification post requests # important notes about advanced_content template should read before use; # - overides all other settings, if custom_fields, title or disable_default_fields are provided then they will be ignored. 
        # - the same set of templating fields is supported as in custom_fields
        # - the author is responsible for ensuring the validity of the json
        # - use the block kit builder tool provided by slack https://app.slack.com/block-kit-builder prior to testing
        advanced_content: |
          {
            "blocks": [
              {
                "type": "section",
                "text": {
                  "type": "mrkdwn",
                  "text": "*Raw POST Anomaly Detected: {{.ProcessorDescription}}*"
                }
              },
              {
                "type": "section",
                "text": {
                  "type": "mrkdwn",
                  "text": "*MatchedTerm* {{.MatchedTerm}}\n*ConfigID* {{.ConfigID}}"
                }
              }
            ]
          }
        custom_fields:
          "Dashboard": "https://app.edgedelta.com/investigation?edac={{.EDAC}}&timestamp={{.Timestamp}}"
          "Current Value": "{{.CurrentValue}}"
          "Threshold Value": "{{.ThresholdValue}}"
          "Custom Message": "{{.CurrentValue}} exceeds {{.ThresholdValue}}"
          "Built-in Threshold Description": "{{.ThresholdDescription}}"
          "Matched Term": "{{.MatchedTerm}}"
          "Threshold Type": "{{.ThresholdType}}"
          "File Path": "{{.FileGlobPath}}"
          "K8s PodName": "{{.K8sPodName}}"
          "K8s Namespace": "{{.K8sNamespace}}"
          "K8s ControllerKind": "{{.K8sControllerKind}}"
          "K8s ContainerName": "{{.K8sContainerName}}"
          "K8s ContainerImage": "{{.K8sContainerImage}}"
          "K8s ControllerLogicalName": "{{.K8sControllerLogicalName}}"
          "ECSCluster": "{{.ECSCluster}}"
          "ECSContainerName": "{{.ECSContainerName}}"
          "ECSTaskVersion": "{{.ECSTaskVersion}}"
          "ECSTaskFamily": "{{.ECSTaskFamily}}"
          "DockerContainerName": "{{.DockerContainerName}}"
          "SourceAttributes": "{{.SourceAttributes}}"
          "ConfigID": "{{.ConfigID}}"
          "EDAC": "{{.EDAC}}"
          "Epoch": "{{.Epoch}}"
          "Host": "{{.Host}}"
          "MetricName": "{{.MetricName}}"
          "Source": "{{.Source}}"
          "SourceType": "{{.SourceType}}"
          "Tag": "{{.Tag}}"
    # organization level integrations can be referred to via their name. in the example below "ed-alert-slack" is an org level slack integration
    - integration_name: ed-alert-slack
      suppression_window: 30m
      suppression_mode: local
      notify_content:
        advanced_content: |
          {
            "blocks": [
              {
                "type": "section",
                "text": {
                  "type": "mrkdwn",
                  "text": "*{{.Tag}}: {{.ThresholdDescription}}*"
                }
              },
              {
                "type": "section",
                "text": {
                  "type": "mrkdwn",
                  "text": "*MatchedTerm* {{.MatchedTerm}}"
                }
              }
            ]
          }
    # anomaly captures can be sent to generic webhook endpoints with a custom JSON payload, such as pagerduty/moogsoft
    - name: pagerduty-integration
      type: pagerduty
      endpoint: https://api.pagerduty.com/incidents
      custom_headers:
        Accept: "application/vnd.pagerduty+json;version=2"
        Content-Type: "application/json"
        Authorization: "Token token=XXXXXXXXXX" # pagerduty API token
        From: "user@edgedelta.com" # user must be a member of the pagerduty account used
      notify_content:
        advanced_content: |
          {
            "incident": {
              "type": "incident",
              "title": "{{.Tag}} {{.Host}} Disk usage at 85%",
              "service": {
                "id": "",
                "type": "service_reference"
              },
              "body": {
                "type": "incident_body",
                "details": "A disk is getting full on this machine. You should investigate what is causing the disk to fill, and ensure that there is an automated process in place for ensuring data is rotated (eg. logs should have logrotate around them). If data is expected to stay on this disk forever, you should start planning to scale up to a larger disk."
              }
            }
          }
    - name: error-anomaly-moogsoft-webhook
      type: webhook
      endpoint: "localhost"
      # basic auth username
      username: user1
      # basic auth password
      password: 12345
      custom_headers:
        header1: value1
      # JSON template variables available: Tag, Host, MetricName, Source, SourceType, EDAC, Epoch, ConfigID
      # Templating only applies to string fields.
      # For the moogsoft mapping check:
      # https://docs.moogsoft.com/AIOps.6.5.0/Alerts_Overview.html
      # https://docs.moogsoft.com/en/webhook.html
      # TODO: IP or location for source_id and agent_location
      payload:
        signature: "{{.MetricName}}"
        source_id: "{{.Host}}"
        external_id: "{{.EDAC}}"
        manager: "edgedelta"
        source: "{{.Host}}"
        class: "application"
        agent_location: "{{.Host}}"
        type: "{{.SourceType}}"
        severity: 3
        description: "high network utilization in application A"
        agent_time: "{{.Epoch}}"
    - name: moogsoft-default
      type: moogsoft
      endpoint: "localhost"
      api_key: "moogsoft-apikey"
      notify_content:
        custom_fields:
          "jira-ticket": "ticket"
    # default payload taken from the Remedy API Helix documentation; might be subject to change.
    # https://docs.bmc.com/docs/itsm2102/example-of-using-the-rest-api-to-create-an-incident-entry-974495996.html
    - name: remedy-default
      type: remedy
      endpoint: "localhost"
      token: remedy-token
      notify_content:
        custom_fields:
          "test-field": "test"
        custom_headers:
          X-header1: "test-header"
    - name: bigpanda-default
      type: bigpanda
      endpoint: "https://api.bigpanda.io/data/v2/alerts"
      token: panda-token
      app_key: panda-app-key
      notify_content:
        custom_fields:
          "test-field": "test"
        custom_headers:
          X-header1: "test-header"
    - name: eventhub-test
      type: eventhub
      endpoint: https://eventshub-test.servicebus.windows.net/test/messages
      token: "test-token"
      notify_content:
        custom_fields:
          "test-field": "test"
        custom_headers:
          X-header1: "test-header"
    - name: email
      type: webhook
      endpoint: "https://api.edgedelta.com/v1/orgs/-/triggers/emails"
      suppression_window: 30m
      suppression_mode: local
      # Custom headers and payload support templating.
      # Template variables available: Tag, Host, MetricName, Source, SourceType, EDAC, Epoch, ConfigID
      custom_headers:
        X-ED-Config-Id: "{{.ConfigID}}"
      # Payload is a dictionary of key value pairs. Values can be strings, numbers or lists.
      # Only string valued fields support templating.
      # Payload will be converted to json and sent in the body of the request.
      payload:
        RequestID: "{{.ConfigID}}-{{.Epoch}}"
        ConfigID: "{{.ConfigID}}"
        Recipients: ["taylan@edgedelta.com"]
        Subject: "Anomaly Detected - EDAC [{{.EDAC}}]"
        Body: |

          Edgedelta found anomaly!

          EDAC: {{.EDAC}}
          Tag: {{.Tag}}
          Host: {{.Host}}
          Source: {{.Source}}
          MetricName: {{.MetricName}}
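    # Illustration (not part of the sample config; values are hypothetical placeholders): for the email trigger
    # above, the payload template renders to a JSON request body similar to:
    #   {
    #     "RequestID": "<config id>-1620939600",
    #     "ConfigID": "<config id>",
    #     "Recipients": ["taylan@edgedelta.com"],
    #     "Subject": "Anomaly Detected - EDAC [<edac id>]",
    #     "Body": "\nEdgedelta found anomaly!\n\nEDAC: <edac id>\n..."
    #   }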
    - name: microsoft-teams-integration
      type: teams
      endpoint: "https://outlook.office.com/webhookb2/XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX@XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX/IncomingWebhook/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX/XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX"
      notify_content:
        title: "Anomaly Detected: {{.ProcessorDescription}}"
        disable_default_fields: false
        custom_fields:
          "Dashboard": "https://app.edgedelta.com/investigation?edac={{.EDAC}}&timestamp={{.Timestamp}}"
          "Current Value": "{{.CurrentValue}}"
          "Threshold Value": "{{.ThresholdValue}}"
          "Custom Message": "{{.CurrentValue}} exceeds {{.ThresholdValue}}"
          "Matched Term": "{{.MatchedTerm}}"
    - name: jira-integration
      type: jira
      endpoint: "https://automation.codebarrel.io/pro/hooks/XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX"
      notify_content:
        advanced_content: |
          {
            "data": {
              "title": "{{.Tag}}",
              "message": "{{.MatchedTerm}}"
            }
          }
    - name: aws-lambda-integration
      type: awslambda
      endpoint: "https://XXXXXXXXXX.execute-api.XXXXXXXXX.amazonaws.com/XXXX/XXXXXX"
      notify_content:
        advanced_content: |
          {
            "foo": "bar"
          }
    - name: azure-functions-integration
      type: azurefunctions
      endpoint: "https://XXXXXXXXXX.azurewebsites.net/XXXX/XXXXXX"
      notify_content:
        advanced_content: |
          {
            "foo": "bar"
          }
    - name: victorops-integration
      type: victorops
      endpoint: "https://api.victorops.com/api-public/v1/incidents"
      # https://portal.victorops.com/dash/edgedelta#/api-management
      # Use the "api id" together with an "api key" when making API calls.
      custom_headers:
        X-VO-Api-Id: "api-id"
        X-VO-Api-Key: "api-key"
      notify_content:
        advanced_content: |
          {
            "summary": "{{ .Title }} - {{ .Message }}",
            "details": "https://app.edgedelta.com/investigation?edac={{.EDAC}}&timestamp={{.Timestamp}}",
            "userName": "username",
            "targets": [
              {
                "type": "EscalationPolicy",
                "slug": "team-xxxxxxx"
              }
            ],
            "isMultiResponder": false
          }
  # Archive destinations are specified in this section. There can be at most one workflow with archive destinations.
  # Raw logs are compressed and sent periodically when one of these conditions is hit:
  # - Compressed logs size reaches 16MB (configurable via ED_ARCHIVE_MAX_BYTE_LIMIT)
  # - 5 minutes pass (configurable via ED_ARCHIVE_FLUSH_INTERVAL)
  #
  # The destination bucket will receive compressed logs as a gzip file every 5min (or more frequently in high volume environments). Max file size is 16MB.
  # The gzip files contain json lines with the raw log and other source attributes.
  # folder path format: {bucket}/{year}/{month}/{day}/{hour}/{tag}/{host}/{random id}.log.gz
  # example: testbucket/2021/05/14/21/prod/host-1/1sXpYZPs83808oGZPJExJXlLXrb.log.gz
  archives:
    - name: my-s3
      type: s3
      aws_key_id: '{{ Env "AWS_KEY_ID" }}'
      aws_sec_key: '{{ Env "AWS_SECRET_KEY" }}'
      bucket: testbucket
      region: us-east-2
    # This s3 archiver gets credentials by assuming a role which is already created by the customer. ref: https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRole.html
    - name: my-s3-assumes-role
      type: s3
      # role_arn is used for assuming an iam role. To see how it works ref: https://docs.aws.amazon.com/IAM/latest/UserGuide/tutorial_cross-account-with-roles.html
      role_arn: "arn:aws:iam::1234567890:role/ed-s3-archiver-role"
      # external_id increases the security of the role by requiring an optional external identifier, which prevents "confused deputy" attacks.
      external_id: "053cf606-8e80-47bf-b849-8cd1cc826cfc"
      bucket: testbucket
      region: us-east-2
    # GCS destination authentication is done using HMAC keys for service accounts.
    # See https://cloud.google.com/storage/docs/authentication/managing-hmackeys for details on how to create new keys
    - name: my-gcs
      type: gcs
      hmac_access_key: my_hmac_access_key_123
      hmac_secret: my_hmac_secret_123
      bucket: ed-test-bucket
    # Azure blob archiver
    - name: my-blob
      type: blob
      account_name: '{{ Env "BLOB_ACCOUNT_NAME" }}'
      account_key: '{{ Env "BLOB_ACCOUNT_KEY" }}'
      container: testcontainer
      auto_create_container: false
    # Minio
    - name: my-minio
      type: minio
      access_key: my_access_key_123
      secret_key: my_secret_key_123
      endpoint: play.minio.com:9000
      bucket: ed-test-bucket-minio
      disable_ssl: true
      # Force the archive destination to use the {endpoint}/{bucket} format instead of {bucket}.{endpoint}/ when reaching buckets.
      s3_force_path_style: true
      encoding: parquet # supported ones: json, parquet
      compression: zstd # supported ones: gzip, zstd, snappy, uncompressed
    # Digital Ocean Spaces
    - name: my-digitalocean-spaces
      type: dos
      endpoint: nyc3.digitaloceanspaces.com
      bucket: ed-test-bucket-dos
      access_key: my_access_key_123
      secret_key: my_secret_key_123
    # IBM Object Storage
    - name: my-ibm-object-storage
      type: ibmos
      endpoint: s3-api.us-geo.objectstorage.softlayer.net
      bucket: ed-test-bucket-ibm
      access_key: my_access_key_123
      secret_key: my_secret_key_123
    # Zenko CloudServer
    - name: my-zenko-cloudserver
      type: zenko
      endpoint: https://XXXXXXXXXX.sandbox.zenko.io
      bucket: ed-test-bucket-zenko
      access_key: my_access_key_123
      secret_key: my_secret_key_123
    - name: local-archive
      type: localstorage
      mounted_path: "/test/path/i/can/write"

# filters do the filtering before processors handle their jobs.
# their names should be unique within the filters section
filters:
  - name: error
    type: regex # type declaration is optional for regex filters because the default type is regex.
    pattern: "error|ERROR|problem|ERR|Err"
  - name: info
    pattern: "info|INFO"
  - name: warn
    pattern: "warn|WARN"
  - name: error_or_info
    pattern: "error|info"
  - name: not_trace
    pattern: "TRACE"
    negate: true
  - name: appinsight_trace_filter
    type: buffered-trace
    # trace_id_pattern must be a regex with a single capture group.
    trace_id_pattern: "\"operation_Id\": \"(?P<trace_id>\\w+)\""
    # failure_pattern is the regex pattern which is used to determine whether the trace events should be considered a failure.
    # all failed traces pass the filter.
    failure_pattern: \"status\":\"Failed\"
    # Latency pattern is the optional regex pattern which extracts the latency value from trace events.
    latency_pattern: "\"latency\": \"(?P<latency>\\d+)\""
    # all high latency traces exceeding the given value pass the filter.
    latency_threshold: 500
    # success_sample_rate is a floating point number between 0 and 1. Default is 0.0.
    success_sample_rate: 0.2
    # trace_deadline is the deadline to wait after the last event of a trace. Once the deadline is reached, filtering/sampling will be applied to the events of the trace.
    trace_deadline: 1m
  - name: mask_card
    type: mask
    # predefined data types (credit_card, us_phone_dash) can be used by only specifying their names.
    predefined_pattern: credit_card
  - name: mask_phone
    type: mask
    predefined_pattern: us_phone_dash
    mask: 'XXXXX'
  - name: mask_password
    type: mask
    # rules with user given names that match custom patterns require a pattern regex.
    pattern: 'password:\s*(?P<pw>\w+)'
    # The captured group names in the regex pattern will be replaced with the corresponding value specified in mask_captured.
    # If 'mask' is defined, this feature will be suppressed by it.
    mask_captured:
      pw: '******'
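  # Illustration (not part of the sample config; the sample value is hypothetical): with the mask_password filter
  # above, a log line containing "password: hunter2" is forwarded as "password: ******", since the captured "pw"
  # group is replaced by the value configured in mask_captured.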
  - name: mask_email
    type: mask
    pattern: 'email:\s*(?P<email>\w+)'
    mask_captured:
      email: '******'
  - name: extract_severity
    type: extract-json-field
    # Field path is a dot separated path of the field (i.e. "log.message"). Its value will be extracted and the original json content will be discarded.
    field_path: "severity"
  - name: drop_some_fields
    type: drop-json-fields
    # Each field path is a dot separated path of the field (i.e. "log.source"). The fields will be removed from the original content and not processed.
    field_paths:
      - "level"
      - "details"
      - "log.source"
  # Custom attribute filters do all the log filtering with the given key-value of the attribute
  - name: custom_attributes_filter
    type: custom-attributes
    key: service
    value: billing
  - name: negate_custom_attributes_filter
    type: custom-attributes
    key: component
    # A comma separated list of values to match. If any of them matches the given attribute's value then the log will pass through
    value: credithandler,debithandler
    # Negate is also supported for attribute filters
    negate: true
  # Filtering custom attributes also supports regex matching
  - name: regex_custom_attributes_filter
    type: custom-attributes
    key: level
    pattern: "error|ERROR|problem|ERR|Err"
  # Filter combination allows grouping filters using and/or operators. Only 1 operator can be used per combination
  - name: combination_filter_or
    type: combination
    operator: or
    # filters listed in a combination can be the name of another filter or an in-line filter
    filters_list:
      - filter_name: error
      - filter_name: custom_attributes_filter
  - name: negate_combination_filter_and
    type: combination
    operator: and
    filters_list:
      - pattern: "debug|DEBUG"
        negate: true
      # A combination filter cannot be defined as an in-line filter.
      # To use a nested combination filter, define the inner filter beforehand and reference it using filter_name.
      - filter_name: negate_custom_attributes_filter
    # filter combinations also support negating
    negate: true

# all monitoring rules are defined in this section.
# their names should be unique within the rules section.
processors:
  cluster:
    name: clustering
    # clustering reports the top N and bottom N clusters. N = num_of_clusters
    num_of_clusters: 100
    # keep the last 20 messages of each cluster
    samples_per_cluster: 20
    # report cluster samples every 30 seconds
    reporting_frequency: 30s
    # default retention is 1h for clusters.
    # if it's set to 10 minutes for example, clusters that don't have any new logs for the last 10 minutes will be dropped.
    retention: 10m
    # enables the cpu aware rate limiter which throttles logs when agent cpu exceeds the soft_cpu_limit defined in the agent_settings section.
    # by default this is disabled and a static rate limiter is enabled which allows at most 200 logs per second from a single source.
    # if you want to maximize the sample size of edgedelta clustering then turn on cpu_friendly and set soft_cpu_limit to a few percent less than the desired agent CPU usage.
    cpu_friendly: true
    # Puts a hard limit on how many logs can be clustered per second from a single source. If cpu_friendly is enabled then this is ignored.
    throttle_limit_per_sec: 200
  regexes:
    # supported trigger thresholds: anomaly_probability_percentage, upper_limit_per_interval, lower_limit_per_interval, consecutive
    # ---------------- Simple Keyword Match Processor ----------------
    - name: "error-regex"
      pattern: "error|ERROR|problem|ERR|Err"
      interval: 2m
      retention: 4h
      anomaly_confidence_period: 1h
      anomaly_tolerance: 0.2
      only_report_nonzeros: true
      description: "Counts of messages including error per 2 minutes."
      trigger_thresholds:
        anomaly_probability_percentage: 90 # hits this condition if anomaly score > 90
        upper_limit_per_interval: 250 # hits this condition if matched event count > 250 for the last recording interval (10s).
        consecutive: 5 # if any of the other threshold conditions is hit 5 times in a row then an alert is triggered. Default is 0, so any condition hit causes an alert.
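    # Illustration (not part of the sample config; the exact metric naming convention is an assumption): a simple
    # keyword match rule like "error-regex" produces per-interval occurrence statistics, e.g. metrics along the
    # lines of error-regex.count, error-regex.anomaly1 and error-regex.anomaly2, which the trigger_thresholds
    # above are evaluated against.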
      trigger_thresholds:
        anomaly_probability_percentage: 90 # hits this condition if anomaly score > 90
        upper_limit_per_interval: 250 # hits this condition if matched event count > 250 for the last recording interval (10s).
        consecutive: 5 # if any of the other threshold conditions is hit 5 times in a row then it will trigger an alert. Default is 0, so any condition hit causes an alert.
    - name: "severity_high"
      # counts all logs whose severity matches "HIGH|high".
      pattern: "HIGH|high"
      filters:
        - extract_severity # Notice that a field extractor filter is specified. This regex rule operates on the severity field values, not the whole json log content.
    # ---------------- Dimension Counter Processor Examples ----------------
    # if named captures in the regex pattern are dimensions then occurrence stats will be generated.
    # in this example an occurrence count for each HTTP method will be generated.
    # http_method_get.count, http_method_post.count...
    - name: "http-method"
      pattern: "] \"(?P<method>\\w+)"
      dimensions: ["method"]
      # enabled_stats can be used to specify the stats generated from a regex rule.
      # defaults:
      #   count, anomaly1 and anomaly2 metrics are generated for occurrence captures
      #   count, min, max, avg, anomaly1 and anomaly2 metrics are generated for numeric captures
      # special stat types:
      #   anomalymin: takes the min of anomaly1 and anomaly2. useful to reduce alert noise
      enabled_stats: ["count", "anomalymin"]
      trigger_thresholds:
        lower_limit_per_interval: 2 # triggers if matched event count < 2 for the last recording interval (10s).
      filters:
        - info
    # This is another dimension counter with dimensions_as_attributes: true.
    # The metrics generated from this processor have the same name but different attribute values. The dimension key/value is sent as an attribute.
    # By default, there is only a single attribute per metric. To group up multiple attributes use "dimensions_groups".
    # Sample generated metrics are the following:
    #   http.count 1 {method="get"}
    #   http.anomaly1 25 {method="get"}
    #   http.count 1 {method="post"}
    #   http.anomaly1 25 {method="post"}
    #   http.count 2 {httpversion="1.1"}
    #   http.anomaly1 25 {httpversion="1.1"}
    #   http.count 2 {code="200"}
    #   http.anomaly1 25 {code="200"}
    - name: "http-single"
      pattern: "] \"(?P<method>\\w+) (?P<path>\\S*) (?P<httpversion>\\S*)\" (?P<code>\\d+)"
      dimensions: ["method", "httpversion", "code"]
      dimensions_as_attributes: true
    # An example of a dimension counter with the use of dimensions_groups to group up attributes for metrics.
    # dimensions_as_attributes must be enabled in order to use dimensions_groups.
    # There can be a single or multiple dimensions groups.
    # Sample generated metrics are the following:
    #   http-group.count 1 {method="get", code="200"}
    #   http-group.anomaly1 25 {method="get", code="200"}
    #   http-group.count 1 {method="post", code="200"}
    #   http-group.anomaly1 25 {method="post", code="200"}
    #   http-group.count 1 {method="get", httpversion="1.1"}
    #   http-group.anomaly1 25 {method="get", httpversion="1.1"}
    #   http-group.count 1 {method="post", httpversion="1.1"}
    #   http-group.anomaly1 25 {method="post", httpversion="1.1"}
    - name: "http-group"
      pattern: "] \"(?P<method>\\w+) (?P<httpversion>\\S*)\" (?P<code>\\d+)"
      dimensions: ["method", "httpversion", "code"]
      dimensions_as_attributes: true
      dimensions_groups:
        - selected_dimensions: ["method", "code"]
        - selected_dimensions: ["method", "httpversion"]
    # ---------------- Dimension Numeric Capture Processor Example ----------------
    # if both dimension and numeric captures are defined in the regex pattern then numeric stats per dimension per numeric value will be generated.
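    # (Clarifying note, not from the original config: a numeric capture is a named group whose matched values are aggregated,
    # e.g. (?P<latency>\d+), while the captures listed under "dimensions" split those stats per dimension value.)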
    # in this example numeric stats for each HTTP method will be generated.
    # http_request_method_get_latency.[avg|min|max|p95|p99|sum], http_request_method_post_latency.[avg|min|max|p95|p99|sum], ...
    - name: "http-request-latencies"
      pattern: "] \"(?P<method>\\w+) took (?P<latency>\\d+) ms"
      dimensions: ["method"]
      # interval is 1 minute by default. The processing rule will collect all captured values
      # and at the end of this interval it will calculate metrics, e.g. average, quantiles and anomaly scores for that duration.
      interval: 1m
      # retention is 3 hours by default. Anomaly scores are calculated based on the history of metrics.
      # keeping retention short will be more sensitive to spikes in metric values.
      retention: 1h
      # skip_empty_intervals will skip the intervals with no match so the overall average/stddev is not affected
      skip_empty_intervals: true
      trigger_thresholds:
        anomaly_probability_percentage: 1
    # ---------------- Numeric Capture Processor Examples ----------------
    - name: "flog"
      # captures both the status code and the response size in flog format as numeric values.
      pattern: " (?P<statuscode>\\d+) (?P<responsesize>\\d+)$" # metric names: flog_statuscode, flog_responsesize.
      trigger_thresholds:
        anomaly_probability_percentage: 1
    - name: "http-response-size"
      # captures the response size in flog format as http-response-size (unnamed single group).
      pattern: " (\\d+)$"
      trigger_thresholds:
        anomaly_probability_percentage: 1
  ratios:
    # supported trigger thresholds: anomaly_probability_percentage, upper_limit_per_interval, lower_limit_per_interval, consecutive
    - name: request-error-ratio
      # calculates the ratio with the following formula: failure / (failure + success)
      success_pattern: "request succeeded"
      failure_pattern: "request failed"
      trigger_thresholds:
        anomaly_probability_percentage: 50
  traces:
    # supported trigger thresholds: max_duration, anomaly_probability_percentage, upper_limit_per_interval, lower_limit_per_interval, consecutive
    - name: login-trace
      start_pattern: "user (?P<user>[0-9a-fA-F]{8}) logged in"
      finish_pattern: "user (?P<user>[0-9a-fA-F]{8}) logged out"
      trigger_thresholds:
        max_duration: 50000 # 50 seconds
  security:
    - name: failed_login_detector
      # Regex to match: 53.138.44.43 - gorczany8437 [13/May/2020:20:12:03 +0000] "GET /granular/implement HTTP/2.0" 201 79206 "https://www.regionaloptimize.io/bleeding-edge/implement/matrix" "Mozilla/5.0 (Macintosh; PPC Mac OS X 10_8_9 rv:6.0; en-US) AppleWebKit/532.7.7 (KHTML, like Gecko) Version/6.2 Safari/532.7.7"
      pattern: (?P<ip>\S+)\s-\s+(?P<user>\S+\s+)+\[(?P<datetime>[^]]+)\]\s"(?P<method>\S*)\s?(?P<request>(?:[^"]*(?:\\")?)*)\s(?P<protocol>[^"]*)"\s(?P<status>\d+)\s(?P<bytes>\d+)\s"(?P<referrer>(?:[^"]*(?:\\")?)*)"\s"(?P<useragent>.*)"
      detection_filter:
        field: status # to be used with the match pattern below
        match_pattern: "401|403" # matching lines will be stored as offenders
        threshold: 10 # if their match count exceeds the given count
        window: 30m # within the specified window
      signature_field: ip # any matching elements with this signature will be filtered and transferred to the configured destination
  top_ks:
    - name: top-api-requests
      # logs matching this pattern will be selected and the named groups combined together will be the key of the record for which we keep a counter.
      # example log matching the rule below: 12.195.88.88 - joe [08/Aug/2020:05:57:49 +0000] "GET /optimize/engage HTTP/1.0" 200 19092
      pattern: (?P<ip>\d+\.\d+\.\d+\.\d+) - \w+ \[.*\] "(?P<method>\w+) (?P<path>.+) HTTP\/\d.0" (?P<status>.+) \d+
      # every interval the top records will be reported and then they will be removed locally
      interval: 30s
      # records are ordered by their count in descending order and the top k items are picked for reporting.
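      # Illustrative example (numbers assumed): if 25 distinct record keys are seen within a 30s interval,
      # only the 10 most frequent ones (k below) are reported and the local record counters are then removed.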
      k: 10
      # if a lower limit is provided, only records whose count is greater than the limit will be able to make it into the top k.
      lower_limit: 1
      # separator is used to combine the named group values together to form a record key. Default is comma ','.
      # For example,
      # the pattern above would generate a record key like this: "12.195.88.88,GET,/optimize/engage,200"
      # let's say this record has been seen 5 times in the last period and it was one of the top k items. then the log below will be reported:
      # "12.195.88.88,GET,/optimize/engage,200=5"
      separator: ","
  heavy_aggregators:
    # heavy aggregators should be put in a workflow whose input is ed_port, which collects metrics from node level agents.
    # it groups the metrics by logical source and applies the given aggregate function. It can also calculate anomalies if needed.
    - name: cluster-error-aggregator
      # this is the metric to collect and aggregate.
      metric_name: error.count
      # the aggregation function is applied when the metric is grouped by logical source. Available aggregate functions are sum, avg, min, max.
      aggregation: sum
      # interval is the period of aggregation. recommended to be set the same as the source metric rule's interval.
      interval: 10s
      # offset is the offset from now. it should be set a few seconds higher than the source metric rule's interval.
      # when the aggregator kicks in, it will select all metrics between [now-offset-interval, now-offset)
      offset: 7s
      # retention is used for anomaly calculations. It should be high enough (> 1000 x interval) to get reliable/non-noisy anomaly scores.
      retention: 3h
      # when set to true the aggregated metric's anomaly1 score will be calculated and reported.
      # otherwise only the aggregate metric is reported at every interval.
      enable_anomalies: true
  anomaly_detectors:
    - name: container-cpu-anomalies
      retention: 1h
      metric_name: cpu_host_perc.value
      trigger_thresholds:
        anomaly_probability_percentage: 90
  thresholds:
    # supported operators: '==', '>', '>=', '<', '<='
    - name: http-latencyp95-threshold
      metric_name_pattern: http_request_method_.*_latency\.p95 # regex pattern
      operator: ">"
      value: 120
    - name: http-avg-threshold
      # Either metric_name or metric_name_pattern must be provided but not both.
      metric_name: http_request_method_getconfig_latency.avg # exact metric name match
      operator: ">="
      value: 50
    - name: cluster-errors-threshold
      metric_name: error.anomaly1
      operator: ">"
      value: 80
    - name: incoming-lines-threshold
      metric_name: incoming_lines.anomaly1
      operator: ">"
      value: 90
    - name: incoming-bytes-threshold
      metric_name: incoming_bytes.anomaly2
      operator: ">"
      value: 90
    - name: consecutive-bytes-threshold
      metric_name: incoming_bytes.anomaly2
      operator: ">"
      value: 90
      # The threshold condition must be met this many times in a row to trigger an alert.
      consecutive: 5
    # A multi-conditions threshold can be defined as follows.
    # An alert is triggered only when all the conditions are hit.
    # metric_names must belong to the same source and workflow.
    # metric_name_regex will be supported for multi-conditions thresholds in the future.
    # A custom interval for flushing out the state of the conditions (triggered/not triggered) in a multi-conditions threshold can be defined.
    # The default for this interval is 5s.
    - name: cluster-errors-multi-threshold
      type: and
      interval: 1m
      conditions:
        - metric_name: http_request_method_updateconfig_latency.avg
          operator: ">="
          value: 100
        - metric_name: http_request_method_deleteconfig_latency.max
          operator: ">"
          value: 125
      consecutive: 5

# all workflows are defined in this section.
# a workflow consists of labels, outputs and rules.
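# (Clarifying note: in the definitions below, "labels" corresponds to the input_labels key, "rules" to the
# processors/filters/thresholds keys, and "outputs" to the destinations key.)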
# an input matches with a workflow if there's any intersection between the input's labels and the workflow's input_labels.
# outputs are referred to by their unique name
#
# Note about incoming line/byte metrics: A source can be added to multiple workflows, however
# only the first workflow that a source appears in will be used for incoming line/byte metric related operations.
# Workflows are ordered by name in ascending order and the first workflow is picked for incoming line/byte metric related operations.
# - incoming line/byte metrics are reported to the destinations of the first workflow that the source appears in.
# - incoming line/byte metric based thresholds, which are specified in the thresholds section of the workflow, are honored only for the first workflow that the source appears in.
workflows:
  # This one demonstrates a workflow that applies a generic rule to all sources containing the same label.
  error-anomaly-workflow:
    filters: # filters are applied before the processors
      - error
    input_labels:
      - errorcheck
    processors:
      - error-regex
    destinations:
      - '{{ Env "TEST_SUMO" }}'
      - error-anomaly-slack
      - error-anomaly-moogsoft-webhook
      - pagerduty-integration
  billing-http-requests-workflow:
    input_labels:
      - billing
    processors:
      - http-request-latencies
    thresholds:
      - http-latencyp95-threshold
      - http-avg-threshold
      - incoming-lines-threshold
      - incoming-bytes-threshold
    destinations:
      - '{{ Env "TEST_SUMO" }}'
      - email
  # This workflow applies clustering only to the input sources from our billing application.
  billing-cluster-workflow:
    input_labels:
      - billing
    processors:
      - clustering
    destinations:
      - datadog-default
  # Edgedelta agent collects system and docker metrics. This workflow specifies where those raw metrics should be sent.
  # No metric processing is supported at this moment.
  stats-workflow:
    input_labels:
      - system
      - docker
      - agent
      - infa-processes
    processors:
      - container-cpu-anomalies
    destinations:
      - '{{ Env "TEST_SUMO" }}'
  # Edgedelta agent keeps its components' health data including status, success and error counts and their short historic values like the past 10m.
  health_stats_workflow:
    input_labels:
      - agent-components-health
    destinations:
      - datadog-custom
  heartbeat_metrics_workflow:
    input_labels:
      - agent-heartbeat
    destinations:
      - '{{ Env "TEST_SUMO" }}'
  # This is an example workflow which pushes error regex rule metrics to the heavy agent endpoint.
  node-level-error-workflow:
    input_labels:
      - errorcheck
    processors:
      - error-regex
    destinations:
      - edport-with-endpoint
      - edport-with-tcp
  # This is an example workflow which is marked as cluster agent only. The agent with the ED_CLUSTER_AGENT=1 environment variable will run this workflow.
  # The workflow listens for error metrics from the ed_port endpoint, runs the aggregator on error.count metrics, checks thresholds and pushes results to sumo/slack.
  cluster-level-error-workflow:
    cluster_agent_only: true
    input_labels:
      - error-counts-per-node
    processors:
      - cluster-error-aggregator
    thresholds:
      - cluster-errors-threshold
    destinations:
      - '{{ Env "TEST_SUMO" }}'
      - error-anomaly-slack
  # This is an example workflow which only runs on the cluster agent.
  # The workflow's inputs are the k8s events monitored via the k8s list events API.
  # In order for the k8s events to be monitored the agent needs access to the cluster API.
  # An example k8s spec which shows how to deploy the cluster agent can be found here: https://gist.github.com/taylanisikdemir/e4335bc85b506a58d93d72fd6e82d7d0
  # Currently we don't have a processor implementation to make those events useful.
  # TODO(ED-830)
  cluster-events-workflow:
    cluster_agent_only: true
    input_labels:
      - k8s-events
    processors: # TODO:
    destinations:
      - '{{ Env "TEST_SUMO" }}'
  # The archiving workflow forwards raw logs to the configured archive destinations.
  # There can be at most one workflow with one or more archive destinations.
  archiving-workflow:
    filters:
      - info
    input_labels:
      - billing
    destinations:
      - my-s3
      - my-gcs
  # Users can define an expiring workflow and set the expiration time in the workflow definition.
  # By using an expiring workflow you can also enable log forwarding until a specific time.
  # Note: the "expires_in" timestamp must be in RFC3339 format.
  log_forward_workflow:
    description: "test for time limit"
    input_labels:
      - system
      - docker
      - agent
      - infa-processes
    filters:
      - info
    destinations:
      - '{{ Env "TEST_SUMO" }}'
    expires_in: 2021-06-01T12:00:00.000Z
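  # Illustrative note (timestamp assumed): any RFC3339 value works for expires_in, e.g. 2021-12-31T23:59:59.000Z;
  # once that time passes, the workflow above stops forwarding logs.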