Kubernetes 方便的帮我们管理和运维程序,但是也带来了一些新的挑战。比如日志问题。用了Kubernetes 之后,就不能方便的登录到服务器上查看stdout日志,查看系统日志,查看程序的输出日志了。

但是 Kubernetes 有提供更系统化,更高效的日志处理方式。这里就 fluent-bit 做一下记录。

fluent-bit

Fluentd & Fluent Bit

二者是同一家公司的两个产品,后者更适合面向“云”的环境。

  • Fluentd is a log collector, processor, and aggregator.
  • Fluent Bit is a log collector and processor (it doesn’t have strong aggregation features like Fluentd).

fluentd vs fluentbit

安装

  1. Docker
docker pull fluent/fluent-bit:1.3
# run the following (useless) test which makes Fluent Bit measure CPU usage by the container
docker run -ti fluent/fluent-bit:1.3 /fluent-bit/bin/fluent-bit -i cpu -o stdout -f 1

# output
Fluent-Bit v1.3.x
Copyright (C) Treasure Data

[2019/10/01 12:29:02] [ info] [engine] started
[0] cpu.0: [1504290543.000487750, {"cpu_p"=>0.750000, "user_p"=>0.250000, "system_p"=>0.500000, "cpu0.p_cpu"=>0.000000, "cpu0.p_user"=>0.000000, "cpu0.p_system"=>0.000000, "cpu1.p_cpu"=>1.000000, "cpu1.p_user"=>0.000000, "cpu1.p_system"=>1.000000, "cpu2.p_cpu"=>1.000000, "cpu2.p_user"=>1.000000, "cpu2.p_system"=>0.000000, "cpu3.p_cpu"=>0.000000, "cpu3.p_user"=>0.000000, "cpu3.p_system"=>0.000000}]

  2. Windows
# 下载
# https://fluentbit.io/releases/1.3/td-agent-bit-1.3.2-1.AMD64.zip
# SHA256 Checksums: 57a75221b6cee25e4a85881f709462702661ab13a97e68b552ecfc1d341184b3
# 检查Hash
PS> Get-FileHash td-agent-bit-1.3.2-1.AMD64.zip

# 执行
PS> .\bin\fluent-bit.exe -i dummy -o stdout

# output
PS> .\bin\fluent-bit.exe -i dummy -o stdout
Fluent Bit v1.3.1
Copyright (C) Treasure Data

[2019/06/28 10:13:04] [ info] [storage] initializing...
[2019/06/28 10:13:04] [ info] [storage] in-memory
[2019/06/28 10:13:04] [ info] [storage] normal synchronization mode, checksum disabled, max_chunks_up=128
[2019/06/28 10:13:04] [ info] [engine] started (pid=10324)
[2019/06/28 10:13:04] [ info] [sp] stream processor started
[0] dummy.0: [1561684385.443823800, {"message"=>"dummy"}]
[1] dummy.0: [1561684386.428399000, {"message"=>"dummy"}]
[2] dummy.0: [1561684387.443641900, {"message"=>"dummy"}]
[3] dummy.0: [1561684388.441405800, {"message"=>"dummy"}]
  3. Kubernetes
# https://raw.githubusercontent.com/fluent/fluent-bit-kubernetes-logging/master/output/elasticsearch/fluent-bit-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: fluent-bit-config
  namespace: logging
  labels:
    k8s-app: fluent-bit
data:
  # Configuration files: server, input, filters and output
  # ======================================================
  fluent-bit.conf: |
    [SERVICE]
        Flush         1
        Log_Level     info
        Daemon        off
        Parsers_File  parsers.conf
        HTTP_Server   On
        HTTP_Listen   0.0.0.0
        HTTP_Port     2020

    @INCLUDE input-kubernetes.conf
    @INCLUDE filter-kubernetes.conf
    @INCLUDE output-elasticsearch.conf

  input-kubernetes.conf: |
    [INPUT]
        Name              tail
        Tag               kube.*
        Path              /var/log/containers/*.log
        Parser            docker
        DB                /var/log/flb_kube.db
        Mem_Buf_Limit     5MB
        Skip_Long_Lines   On
        Refresh_Interval  10

  filter-kubernetes.conf: |
    [FILTER]
        Name                kubernetes
        Match               kube.*
        Kube_URL            https://kubernetes.default.svc:443
        Kube_CA_File        /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        Kube_Token_File     /var/run/secrets/kubernetes.io/serviceaccount/token
        Kube_Tag_Prefix     kube.var.log.containers.
        Merge_Log           On
        Merge_Log_Key       log_processed
        K8S-Logging.Parser  On
        K8S-Logging.Exclude Off

  output-elasticsearch.conf: |
    [OUTPUT]
        Name            es
        Match           *
        Host            ${FLUENT_ELASTICSEARCH_HOST}
        Port            ${FLUENT_ELASTICSEARCH_PORT}
        Logstash_Format On
        Replace_Dots    On
        Retry_Limit     False

  parsers.conf: |
    [PARSER]
        Name        apache
        Format      regex
        Regex       ^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) \[(?<time>[^\]]*)\] "(?<method>\S+)(?: +(?<path>[^\"]*?)(?: +\S*)?)?" (?<code>[^ ]*) (?<size>[^ ]*)(?: "(?<referer>[^\"]*)" "(?<agent>[^\"]*)")?$
        Time_Key    time
        Time_Format %d/%b/%Y:%H:%M:%S %z

    [PARSER]
        Name        apache2
        Format      regex
        Regex       ^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) \[(?<time>[^\]]*)\] "(?<method>\S+)(?: +(?<path>[^ ]*) +\S*)?" (?<code>[^ ]*) (?<size>[^ ]*)(?: "(?<referer>[^\"]*)" "(?<agent>[^\"]*)")?$
        Time_Key    time
        Time_Format %d/%b/%Y:%H:%M:%S %z

    [PARSER]
        Name        apache_error
        Format      regex
        Regex       ^\[[^ ]* (?<time>[^\]]*)\] \[(?<level>[^\]]*)\](?: \[pid (?<pid>[^\]]*)\])?( \[client (?<client>[^\]]*)\])? (?<message>.*)$

    [PARSER]
        Name        nginx
        Format      regex
        Regex       ^(?<remote>[^ ]*) (?<host>[^ ]*) (?<user>[^ ]*) \[(?<time>[^\]]*)\] "(?<method>\S+)(?: +(?<path>[^\"]*?)(?: +\S*)?)?" (?<code>[^ ]*) (?<size>[^ ]*)(?: "(?<referer>[^\"]*)" "(?<agent>[^\"]*)")?$
        Time_Key    time
        Time_Format %d/%b/%Y:%H:%M:%S %z

    [PARSER]
        Name        json
        Format      json
        Time_Key    time
        Time_Format %d/%b/%Y:%H:%M:%S %z

    [PARSER]
        Name        docker
        Format      json
        Time_Key    time
        Time_Format %Y-%m-%dT%H:%M:%S.%L
        Time_Keep   On

    [PARSER]
        Name        syslog
        Format      regex
        Regex       ^\<(?<pri>[0-9]+)\>(?<time>[^ ]* {1,2}[^ ]* [^ ]*) (?<host>[^ ]*) (?<ident>[a-zA-Z0-9_\/\.\-]*)(?:\[(?<pid>[0-9]+)\])?(?:[^\:]*\:)? *(?<message>.*)$
        Time_Key    time
        Time_Format %b %d %H:%M:%S
基础概念

Default Config

[SERVICE]
    # Flush
    # =====
    # Set an interval of seconds before to flush records to a destination
    Flush        5

    # Daemon
    # ======
    # Instruct Fluent Bit to run in foreground or background mode.
    Daemon       Off

    # Log_Level
    # =========
    # Set the verbosity level of the service, values can be:
    #
    # - error
    # - warning
    # - info
    # - debug
    # - trace
    #
    # By default 'info' is set, that means it includes 'error' and 'warning'.
    Log_Level    info

    # Parsers_File
    # ============
    # Specify an optional 'Parsers' configuration file
    Parsers_File parsers.conf
    Plugins_File plugins.conf

    # HTTP Server
    # ===========
    # Enable/Disable the built-in HTTP Server for metrics
    HTTP_Server  Off
    HTTP_Listen  0.0.0.0
    HTTP_Port    2020

[INPUT]
    Name cpu
    Tag  cpu.local
    # Interval_Sec
    # ============
    # Read interval (sec) Default: 1
    Interval_Sec 1

[OUTPUT]
    Name  stdout
    Match *
Schema

Section 不能为空;4空格缩进;必须换行;

配置文件支持拆分和引用

@INCLUDE somefile.conf
@INCLUDE input_*.conf
  • Note that despites the order of inclusion, Fluent Bit will ALWAYS respect the following order:
  1. Service
  2. Inputs
  3. Filters
  4. Outputs

支持环境变量和自定义变量

# Environment variable example: export MY_OUTPUT=stdout
# Custom variables defined with @SET:
@SET my_input=cpu
@SET my_output=stdout

[SERVICE]
    Flush 1

[INPUT]
    Name ${my_input}

# ${my_output} resolves to the @SET value above; an environment
# variable like ${MY_OUTPUT} would be substituted the same way.
[OUTPUT]
    Name  ${my_output}
    Match *

Buffering / Storage

  • That configuration enables an optional buffering mechanism where the root path for data is /var/log/flb-storage/; it uses normal synchronization mode, without checksum, and up to a maximum of 5MB of memory when processing backlog data.
[SERVICE]
    flush                     1
    log_Level                 info
    storage.path              /var/log/flb-storage/
    storage.sync              normal
    storage.checksum          off
    storage.backlog.mem_limit 5M

# Specify the buffering mechanism to use. It can be memory or filesystem.
[INPUT]
    name         cpu
    storage.type filesystem

[INPUT]
    name         mem
    storage.type memory

Metrics

By default configured plugins on runtime get an internal name in the format plugin_name.ID. For monitoring purposes this can be confusing if many plugins of the same type were configured. To make a distinction each configured input or output section can get an alias that will be used as the parent name for the metric.

[SERVICE]
    HTTP_Server On
    HTTP_Listen 0.0.0.0
    HTTP_Port   2020

[INPUT]
    Name  cpu
    Alias server1_cpu

[OUTPUT]
    Name  stdout
    Alias raw_output
    Match *

# Metrics endpoint: /api/v1/metrics/prometheus

Upstream Servers

  • It’s common that Fluent Bit output plugins aims to connect to external services to deliver the logs over the network, this is the case of HTTP, Elasticsearch and Forward within others. Being able to connect to one node (host) is normal and enough for more of the use cases, but there are other scenarios where balancing across different nodes is required. The Upstream feature provides such capability.
  • An Upstream defines a set of nodes that will be targeted by an output plugin, by the nature of the implementation an output plugin must support the Upstream feature. The following plugin(s) have Upstream support: Forward.
  • The current balancing mode implemented is round-robin.
# The following example defines an Upstream called forward-balancing which aims to be used by Forward output plugin, it register three Nodes:
[UPSTREAM]
    name forward-balancing

    # node-1: connects to 127.0.0.1:43000
    [NODE]
        name node-1
        host 127.0.0.1
        port 43000

    # node-2: connects to 127.0.0.1:44000
    [NODE]
        name node-2
        host 127.0.0.1
        port 44000

    # node-3: connects to 127.0.0.1:45000 using TLS without verification. It also defines a specific configuration option required by Forward output called shared_key.
    [NODE]
        name       node-3
        host       127.0.0.1
        port       45000
        tls        on
        tls.verify off
        shared_key secret

Scheduler

Fluent Bit has an Engine that helps to coordinate the data ingestion from input plugins and calls the Scheduler to decide when it is time to flush the data through one or multiple output plugins. The Scheduler flushes new data at a fixed interval of seconds and schedules retries when asked.

# The following example configure two outputs where the HTTP plugin have an unlimited number of retries and the Elasticsearch plugin have a limit of 5 times:
[OUTPUT]
    Name        http
    Host        192.168.5.6
    Port        8080
    Retry_Limit False

[OUTPUT]
    Name            es
    Host            192.168.5.20
    Port            9200
    Logstash_Format On
    Retry_Limit     5

Input Plugins Tail

The plugin reads every matched file in the Path pattern and for every new line found (separated by a \n), it generates a new record. Optionally a database file can be used so the plugin can have a history of tracked files and a state of offsets, this is very useful to resume a state if the service is restarted.

[INPUT]
    Name tail
    Path /var/log/syslog

[OUTPUT]
    Name  stdout
    Match *

Parser

By default, Fluent Bit provides a set of pre-configured parsers that can be used for different use cases such as logs from: Apache, Nginx, Docker, Syslog rfc5424, Syslog rfc3164.

All parsers must be defined in a parsers.conf file, not in the Fluent Bit global configuration file. The parsers file expose all parsers available that can be used by the Input plugins that are aware of this feature. A parsers file can have multiple entries like this:

[PARSER]
    Name        docker
    Format      json
    Time_Key    time
    Time_Format %Y-%m-%dT%H:%M:%S.%L
    Time_Keep   On

[PARSER]
    Name        syslog-rfc5424
    Format      regex
    Regex       ^\<(?<pri>[0-9]{1,5})\>1 (?<time>[^ ]+) (?<host>[^ ]+) (?<ident>[^ ]+) (?<pid>[-0-9]+) (?<msgid>[^ ]+) (?<extradata>(\[(.*)\]|-)) (?<message>.+)$
    Time_Key    time
    Time_Format %Y-%m-%dT%H:%M:%S.%L
    Time_Keep   On
    Types       pid:integer

Filter in Kubernetes

Kubernetes Annotations

apiVersion: v1
kind: Pod
metadata:
  name: apache-logs
  labels:
    app: apache-logs
  annotations:
    # define parser
    fluentbit.io/parser: apache
    # exclude
    fluentbit.io/exclude: "true"
spec:
  containers:
    - name: apache
      image: edsiper/apache_logs

Workflow of Tail + Kubernetes Filter

[INPUT]
    Name   tail
    Tag    kube.*
    Path   /var/log/containers/*.log
    Parser docker

[FILTER]
    Name            kubernetes
    Match           kube.*
    Kube_URL        https://kubernetes.default.svc:443
    Kube_CA_File    /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    Kube_Token_File /var/run/secrets/kubernetes.io/serviceaccount/token
    Kube_Tag_Prefix kube.var.log.containers.
    Merge_Log       On
    Merge_Log_Key   log_processed

Helm Charts and Yaml

apiVersion: v1
data:
  fluent-bit.conf: |
    [SERVICE]
        Flush        2
        Daemon       off
        Log_Level    info
        Parsers_File parsers.conf
        Plugins_File plugins.conf
        HTTP_Server  On
        HTTP_Listen  0.0.0.0
        HTTP_Port    2020

Modify Filter

  • The Modify Filter plugin allows you to change records using rules and conditions.
[INPUT]
    Name mem
    Tag  mem.local

[OUTPUT]
    Name  stdout
    Match *

[FILTER]
    Name   modify
    Match  *
    Add    Service1   SOMEVALUE
    Add    Service3   SOMEVALUE3
    Add    Mem.total2 TOTALMEM2
    Rename Mem.free   MEMFREE
    Rename Mem.used   MEMUSED
    Rename Swap.total SWAPTOTAL
    Add    Mem.total  TOTALMEM
  • Conditionally Add and Remove
[INPUT]
    Name         mem
    Tag          mem.local
    Interval_Sec 1

[FILTER]
    Name  modify
    Match mem.*

    Condition Key_Does_Not_Exist cpustats
    Condition Key_Exists         Mem.used

    Set cpustats UNKNOWN

[FILTER]
    Name  modify
    Match mem.*

    Condition Key_Value_Does_Not_Equal cpustats KNOWN

    Add sourcetype memstats

[FILTER]
    Name  modify
    Match mem.*

    Condition Key_Value_Equals cpustats UNKNOWN

    Remove_wildcard Mem
    Remove_wildcard Swap
    Add             cpustats_more STILL_UNKNOWN

[OUTPUT]
    Name  stdout
    Match *

Routing

There are two important concepts in Routing: Tag, Match

  • When the data is generated by the input plugins, it comes with a Tag (most of the time the Tag is configured manually), the Tag is a human-readable indicator that helps to identify the data source.
  • In order to define where the data should be routed, a Match rule must be specified in the output configuration.
# Consider the following configuration example that aims to deliver CPU metrics to an Elasticsearch database and Memory metrics to the standard output interface:
[INPUT]
    Name cpu
    Tag  my_cpu

[INPUT]
    Name mem
    Tag  my_mem

[OUTPUT]
    Name  es
    Match my_cpu

[OUTPUT]
    Name  stdout
    Match my_mem
  • Routing works automatically reading the Input Tags and the Output Match rules. If some data has a Tag that doesn’t match upon routing time, the data is deleted.

  • Routing with Wildcard: Routing is flexible enough to support wildcard in the Match pattern. The below example defines a common destination for both sources of data:

# The match rule is set to my_* which means it will match any Tag that starts with my_.
[INPUT]
    Name cpu
    Tag  my_cpu

[INPUT]
    Name mem
    Tag  my_mem

[OUTPUT]
    Name  stdout
    Match my_*

Output Plugins

Elasticsearch

[INPUT]
    Name cpu
    Tag  cpu

[OUTPUT]
    Name  es
    Match *
    Host  192.168.2.3
    Port  9200
    Index my_index
    Type  my_type

Demo

下面是一个 sidecar 模式部署的 flb 收集 nginx 日志的例子和 Yaml 文件。

大致的流程示意图:flb-sidecar

  • 注意:下面挂载的共享日志目录是 /app/log/ ,而非默认的 /var/log/nginx/ 目录。

因为默认的 nginx 镜像会把 access.log -> stdout,所以不能直接挂载。这里测试的时候,是直接进入 nginx 的容器,把 access.log 手动 copy 到 /app/log/ 目录下,再观察 fluent-bit 的 Container 会不会把 access.log 的内容输出到 stdout 上。

---
# Source: default/templates/configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: zeed-flb-default
  namespace: default
  labels:
    app.kubernetes.io/name: default
    helm.sh/chart: default-0.1.0
    app.kubernetes.io/instance: zeed-flb
    app.kubernetes.io/version: "1.0"
    app.kubernetes.io/managed-by: Tiller
data:
  # Configuration files: server, input, filters and output
  # ======================================================
  fluent-bit.conf: |
    [SERVICE]
        Flush        5
        Log_Level    info
        Daemon       off
        Parsers_File parsers.conf
        HTTP_Server  On
        HTTP_Listen  0.0.0.0
        HTTP_Port    2020

    [INPUT]
        Name dummy
        Tag  dummy.log

    [INPUT]
        Name   tail
        Tag    nginx.access
        Parser nginx
        Path   /app/log/nginx-access.log

    [INPUT]
        Name   tail
        Tag    nginx.error
        Parser nginx
        Path   /app/log/nginx-error.log

    [OUTPUT]
        Name  stdout
        Match *

    # NOTE(review): the inputs are tagged nginx.access / nginx.error, so the
    # pattern es.nginx.* matches neither tag and this output never receives
    # records — confirm the intended match pattern (e.g. nginx.*).
    [OUTPUT]
        Name  es
        Match es.nginx.*
        Host  192.168.2.3
        Port  9200
        Index my_index
        Type  my_type

  parsers.conf: |
    [PARSER]
        Name        apache
        Format      regex
        Regex       ^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) \[(?<time>[^\]]*)\] "(?<method>\S+)(?: +(?<path>[^\"]*?)(?: +\S*)?)?" (?<code>[^ ]*) (?<size>[^ ]*)(?: "(?<referer>[^\"]*)" "(?<agent>[^\"]*)")?$
        Time_Key    time
        Time_Format %d/%b/%Y:%H:%M:%S %z

    [PARSER]
        Name        apache2
        Format      regex
        Regex       ^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) \[(?<time>[^\]]*)\] "(?<method>\S+)(?: +(?<path>[^ ]*) +\S*)?" (?<code>[^ ]*) (?<size>[^ ]*)(?: "(?<referer>[^\"]*)" "(?<agent>[^\"]*)")?$
        Time_Key    time
        Time_Format %d/%b/%Y:%H:%M:%S %z

    [PARSER]
        Name        apache_error
        Format      regex
        Regex       ^\[[^ ]* (?<time>[^\]]*)\] \[(?<level>[^\]]*)\](?: \[pid (?<pid>[^\]]*)\])?( \[client (?<client>[^\]]*)\])? (?<message>.*)$

    [PARSER]
        Name        nginx
        Format      regex
        Regex       ^(?<remote>[^ ]*) (?<host>[^ ]*) (?<user>[^ ]*) \[(?<time>[^\]]*)\] "(?<method>\S+)(?: +(?<path>[^\"]*?)(?: +\S*)?)?" (?<code>[^ ]*) (?<size>[^ ]*)(?: "(?<referer>[^\"]*)" "(?<agent>[^\"]*)")?$
        Time_Key    time
        Time_Format %d/%b/%Y:%H:%M:%S %z

    [PARSER]
        Name        json
        Format      json
        Time_Key    time
        Time_Format %d/%b/%Y:%H:%M:%S %z

    [PARSER]
        Name        docker
        Format      json
        Time_Key    time
        Time_Format %Y-%m-%dT%H:%M:%S.%L
        Time_Keep   On

    [PARSER]
        Name        syslog
        Format      regex
        Regex       ^\<(?<pri>[0-9]+)\>(?<time>[^ ]* {1,2}[^ ]* [^ ]*) (?<host>[^ ]*) (?<ident>[a-zA-Z0-9_\/\.\-]*)(?:\[(?<pid>[0-9]+)\])?(?:[^\:]*\:)? *(?<message>.*)$
        Time_Key    time
        Time_Format %b %d %H:%M:%S
---
# Source: default/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: zeed-flb-default
  namespace: default
  labels:
    app.kubernetes.io/name: default
    helm.sh/chart: default-0.1.0
    app.kubernetes.io/instance: zeed-flb
    app.kubernetes.io/version: "1.0"
    app.kubernetes.io/managed-by: Tiller
spec:
  type: ClusterIP
  ports:
    - port: 80
      targetPort: http
      protocol: TCP
      name: http
  selector:
    app.kubernetes.io/name: default
    app.kubernetes.io/instance: zeed-flb

---
# Source: default/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: zeed-flb-default
  namespace: default
  labels:
    app.kubernetes.io/name: default
    helm.sh/chart: default-0.1.0
    app.kubernetes.io/instance: zeed-flb
    app.kubernetes.io/version: "1.0"
    app.kubernetes.io/managed-by: Tiller
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: default
      app.kubernetes.io/instance: zeed-flb
  template:
    metadata:
      labels:
        app.kubernetes.io/name: default
        app.kubernetes.io/instance: zeed-flb
    spec:
      containers:
        # Main application container; nginx writes its logs into the
        # shared emptyDir volume mounted at /app/log/.
        - name: default
          image: "nginx:stable"
          imagePullPolicy: IfNotPresent
          ports:
            - name: http
              containerPort: 80
              protocol: TCP
          livenessProbe:
            httpGet:
              path: /
              port: http
          readinessProbe:
            httpGet:
              path: /
              port: http
          resources: {}
          volumeMounts:
            - name: log-volume
              mountPath: /app/log/
        # Fluent Bit sidecar: tails the shared log volume and exposes
        # its metrics endpoint on port 2020.
        - name: default-fluentbit
          image: "fluent/fluent-bit:1.3"
          imagePullPolicy: IfNotPresent
          ports:
            - name: metrics
              containerPort: 2020
              protocol: TCP
          volumeMounts:
            - name: config-volume
              mountPath: /fluent-bit/etc/
            - name: log-volume
              mountPath: /app/log/
      volumes:
        - name: log-volume
          emptyDir: {}
        - name: config-volume
          configMap:
            name: zeed-flb-default

---
# Source: default/templates/ingress.yaml