// categraf/inputs/docker/docker.go

package docker

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"log"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"flashcat.cloud/categraf/config"
	"flashcat.cloud/categraf/inputs"
	"flashcat.cloud/categraf/pkg/choice"
	"flashcat.cloud/categraf/pkg/dock"
	"flashcat.cloud/categraf/pkg/filter"
	tlsx "flashcat.cloud/categraf/pkg/tls"
	itypes "flashcat.cloud/categraf/types"
	"github.com/docker/docker/api/types"
	"github.com/docker/docker/api/types/filters"
	"github.com/docker/docker/api/types/swarm"
	"github.com/toolkits/pkg/container/list"
)
const inputName = "docker"

// Decimal size units (KB, MB, GB, TB, PB), human friendly; only used by the
// commented-out parseSize helper at the bottom of this file.
const (
	KB = 1000
	MB = 1000 * KB
	GB = 1000 * MB
	TB = 1000 * GB
	PB = 1000 * TB
)

var (
	// errInspectTimeout is returned by gatherContainerInspect on timeout.
	// Declared here since no other declaration appears in this file.
	errInspectTimeout = errors.New("timeout while inspecting container")

	// sizeRegex = regexp.MustCompile(`^(\d+(\.\d+)*) ?([kKmMgGtTpP])?[bB]?$`)

	containerStates        = []string{"created", "restarting", "running", "removing", "paused", "exited", "dead"}
	containerMetricClasses = []string{"cpu", "network", "blkio"}
)
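
// Docker is the plugin entry point: it fans a Gather call out to each
// configured Instance, every one of which scrapes a single Docker endpoint.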
type Docker struct {
	config.Interval
	counter   uint64
	waitgrp   sync.WaitGroup
	Instances []*Instance `toml:"instances"`
}

func init() {
	inputs.Add(inputName, func() inputs.Input {
		return &Docker{}
	})
}

func (d *Docker) Prefix() string {
	return ""
}
func (d *Docker) Init() error {
	if len(d.Instances) == 0 {
		return itypes.ErrInstancesEmpty
	}

	for i := 0; i < len(d.Instances); i++ {
		if err := d.Instances[i].Init(); err != nil {
			return err
		}
	}

	return nil
}

func (d *Docker) Drop() {}
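
// Gather runs once per collection cycle. Instances are gathered concurrently,
// and an instance with interval_times = N only collects on every Nth cycle.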
func (d *Docker) Gather(slist *list.SafeList) {
	atomic.AddUint64(&d.counter, 1)

	for i := range d.Instances {
		ins := d.Instances[i]

		d.waitgrp.Add(1)
		go func(slist *list.SafeList, ins *Instance) {
			defer d.waitgrp.Done()

			if ins.IntervalTimes > 0 {
				counter := atomic.LoadUint64(&d.counter)
				if counter%uint64(ins.IntervalTimes) != 0 {
					return
				}
			}

			ins.gatherOnce(slist)
		}(slist, ins)
	}

	d.waitgrp.Wait()
}
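
// Instance is the per-endpoint configuration. A minimal TOML sketch (field
// names come from the toml tags below; the endpoint value is just an example
// for a local daemon):
//
//	[[instances]]
//	endpoint = "unix:///var/run/docker.sock"
//	perdevice_include = ["cpu"]
//	total_include = ["cpu", "network", "blkio"]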
type Instance struct {
	Labels                     map[string]string `toml:"labels"`
	IntervalTimes              int64             `toml:"interval_times"`
	Endpoint                   string            `toml:"endpoint"`
	GatherServices             bool              `toml:"gather_services"`
	GatherExtendMemstats       bool              `toml:"gather_extend_memstats"`
	ContainerIDLabelEnable     bool              `toml:"container_id_label_enable"`
	ContainerIDLabelShortStyle bool              `toml:"container_id_label_short_style"`
	PerDeviceInclude           []string          `toml:"perdevice_include"`
	TotalInclude               []string          `toml:"total_include"`
	TagEnvironment             []string          `toml:"tag_env"`
	LabelInclude               []string          `toml:"docker_label_include"`
	LabelExclude               []string          `toml:"docker_label_exclude"`
	ContainerInclude           []string          `toml:"container_name_include"`
	ContainerExclude           []string          `toml:"container_name_exclude"`
	ContainerStateInclude      []string          `toml:"container_state_include"`
	ContainerStateExclude      []string          `toml:"container_state_exclude"`
	Timeout                    config.Duration
	tlsx.ClientConfig

	client          Client
	labelFilter     filter.Filter
	containerFilter filter.Filter
	stateFilter     filter.Filter
}
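
// Init connects and pings the endpoint, validates perdevice_include and
// total_include against the known metric classes (cpu, network, blkio), and
// compiles the label, container-name, and container-state filters.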
func (ins *Instance) Init() error {
	c, err := ins.getNewClient()
	if err != nil {
		return err
	}
	ins.client = c

	err = choice.CheckSlice(ins.PerDeviceInclude, containerMetricClasses)
	if err != nil {
		return fmt.Errorf("error validating 'perdevice_include' setting: %v", err)
	}

	err = choice.CheckSlice(ins.TotalInclude, containerMetricClasses)
	if err != nil {
		return fmt.Errorf("error validating 'total_include' setting: %v", err)
	}

	if err = ins.createLabelFilters(); err != nil {
		return err
	}

	if err = ins.createContainerFilters(); err != nil {
		return err
	}

	if err = ins.createContainerStateFilters(); err != nil {
		return err
	}

	return nil
}
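
// gatherOnce performs one collection pass: report daemon liveness as
// docker_up, optionally gather swarm service metrics, then list the
// containers matching the state filter and gather them concurrently.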
func (ins *Instance) gatherOnce(slist *list.SafeList) {
	if ins.Endpoint == "" {
		return
	}

	if ins.client == nil {
		c, err := ins.getNewClient()
		if err != nil {
			slist.PushFront(inputs.NewSample("docker_up", 0, ins.Labels))
			log.Println("E! failed to create docker client:", err)
			return
		}
		ins.client = c
	}

	// The docker SDK's Close only releases idle connections, so the cached
	// client remains usable on the next gather.
	defer ins.client.Close()

	if err := ins.gatherInfo(slist); err != nil {
		slist.PushFront(inputs.NewSample("docker_up", 0, ins.Labels))
		log.Println("E! failed to gather docker info:", err)
		return
	}

	slist.PushFront(inputs.NewSample("docker_up", 1, ins.Labels))

	if ins.GatherServices {
		ins.gatherSwarmInfo(slist)
	}

	filterArgs := filters.NewArgs()
	for _, state := range containerStates {
		if ins.stateFilter.Match(state) {
			filterArgs.Add("status", state)
		}
	}

	// All container states were excluded
	if filterArgs.Len() == 0 {
		return
	}

	// List containers
	opts := types.ContainerListOptions{
		Filters: filterArgs,
	}

	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(ins.Timeout))
	defer cancel()

	containers, err := ins.client.ContainerList(ctx, opts)
	if errors.Is(err, context.DeadlineExceeded) {
		log.Println("E! failed to gather container list: timeout")
		return
	}
	if err != nil {
		log.Println("E! failed to gather container list:", err)
		return
	}

	// Get container data
	var wg sync.WaitGroup
	wg.Add(len(containers))
	for _, container := range containers {
		go func(c types.Container) {
			defer wg.Done()
			ins.gatherContainer(c, slist)
		}(container)
	}
	wg.Wait()
}
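
// gatherContainer resolves the container's primary name (the first name with
// no slash once the leading one is trimmed), applies the name filter, fetches
// a one-shot stats snapshot, and copies whitelisted labels into tags.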
func (ins *Instance) gatherContainer(container types.Container, slist *list.SafeList) {
	// Parse container name
	var cname string
	for _, name := range container.Names {
		trimmedName := strings.TrimPrefix(name, "/")
		if !strings.Contains(trimmedName, "/") {
			cname = trimmedName
			break
		}
	}

	if cname == "" {
		return
	}

	if !ins.containerFilter.Match(cname) {
		return
	}

	imageName, _ := dock.ParseImage(container.Image)

	tags := map[string]string{
		"container_name":  cname,
		"container_image": imageName,
		// "container_version": imageVersion,
	}

	if ins.ContainerIDLabelEnable {
		tags["container_id"] = container.ID
		if ins.ContainerIDLabelShortStyle {
			tags["container_id"] = hostnameFromID(container.ID)
		}
	}

	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(ins.Timeout))
	defer cancel()

	r, err := ins.client.ContainerStats(ctx, container.ID, false)
	if errors.Is(err, context.DeadlineExceeded) {
		log.Println("E! failed to get container stats: timeout")
		return
	}
	if err != nil {
		log.Println("E! failed to get container stats:", err)
		return
	}
	defer r.Body.Close()

	dec := json.NewDecoder(r.Body)

	var v *types.StatsJSON
	if err = dec.Decode(&v); err != nil {
		if err != io.EOF {
			log.Println("E! failed to decode output of container stats:", err)
		}
		return
	}

	// Add labels to tags
	for k, label := range container.Labels {
		if ins.labelFilter.Match(k) {
			tags[k] = label
		}
	}

	err = ins.gatherContainerInspect(container, slist, tags, r.OSType, v)
	if err != nil {
		log.Println("E! failed to gather container inspect:", err)
	}
}
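
// gatherContainerInspect inspects the container to derive state timestamps,
// uptime, health, and whitelisted environment-variable tags, then hands the
// stats snapshot to parseContainerStats.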
func (ins *Instance) gatherContainerInspect(container types.Container, slist *list.SafeList, tags map[string]string, daemonOSType string, v *types.StatsJSON) error {
	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(ins.Timeout))
	defer cancel()

	info, err := ins.client.ContainerInspect(ctx, container.ID)
	if errors.Is(err, context.DeadlineExceeded) {
		return errInspectTimeout
	}
	if err != nil {
		return fmt.Errorf("error inspecting docker container: %v", err)
	}

	// Add whitelisted environment variables to tags
	if len(ins.TagEnvironment) > 0 {
		for _, envvar := range info.Config.Env {
			for _, configvar := range ins.TagEnvironment {
				dockEnv := strings.SplitN(envvar, "=", 2)
				// check for presence of tag in whitelist
				if len(dockEnv) == 2 && len(strings.TrimSpace(dockEnv[1])) != 0 && configvar == dockEnv[0] {
					tags[dockEnv[0]] = dockEnv[1]
				}
			}
		}
	}

	statefields := make(map[string]interface{})

	finished, err := time.Parse(time.RFC3339, info.State.FinishedAt)
	if err == nil && !finished.IsZero() {
		statefields["docker_container_status_finished_at"] = finished.Unix()
	} else {
		// the container is still running: fall back to now so the uptime
		// below is measured against the current time
		finished = time.Now()
	}

	started, err := time.Parse(time.RFC3339, info.State.StartedAt)
	if err == nil && !started.IsZero() {
		statefields["docker_container_status_started_at"] = started.Unix()

		uptime := finished.Sub(started)
		if finished.Before(started) {
			// a restarted container keeps FinishedAt from its previous run
			uptime = time.Since(started)
		}
		statefields["docker_container_status_uptime"] = uptime.Seconds()
	}

	inputs.PushSamples(slist, statefields, tags, ins.Labels)

	if info.State.Health != nil {
		slist.PushFront(inputs.NewSample("docker_container_health_failing_streak", info.State.Health.FailingStreak, tags, ins.Labels))
	}

	ins.parseContainerStats(v, slist, tags, daemonOSType)

	return nil
}
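
// parseContainerStats flattens one StatsJSON snapshot into memory, CPU,
// network, and block-I/O samples. Per-device and total series are gated by
// the perdevice_include and total_include settings respectively.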
func (ins *Instance) parseContainerStats(stat *types.StatsJSON, slist *list.SafeList, tags map[string]string, ostype string) {
	// memory
	basicMemstats := []string{
		"cache",
		"rss",
		"total_cache",
		"total_rss",
	}

	extendMemstats := []string{
		"active_anon",
		"active_file",
		"hierarchical_memory_limit",
		"inactive_anon",
		"inactive_file",
		"mapped_file",
		"pgfault",
		"pgmajfault",
		"pgpgin",
		"pgpgout",
		"rss_huge",
		"total_active_anon",
		"total_active_file",
		"total_inactive_anon",
		"total_inactive_file",
		"total_mapped_file",
		"total_pgfault",
		"total_pgmajfault",
		"total_pgpgin",
		"total_pgpgout",
		"total_rss_huge",
		"total_unevictable",
		"total_writeback",
		"unevictable",
		"writeback",
	}

	memfields := map[string]interface{}{}

	for _, field := range basicMemstats {
		if value, ok := stat.MemoryStats.Stats[field]; ok {
			memfields["docker_container_mem_"+field] = value
		}
	}

	if ins.GatherExtendMemstats {
		for _, field := range extendMemstats {
			if value, ok := stat.MemoryStats.Stats[field]; ok {
				memfields["docker_container_mem_"+field] = value
			}
		}
	}

	if stat.MemoryStats.Failcnt != 0 {
		memfields["docker_container_mem_fail_count"] = stat.MemoryStats.Failcnt
	}

	if ostype != "windows" {
		memfields["docker_container_mem_limit"] = stat.MemoryStats.Limit
		memfields["docker_container_mem_max_usage"] = stat.MemoryStats.MaxUsage

		mem := CalculateMemUsageUnixNoCache(stat.MemoryStats)
		memLimit := float64(stat.MemoryStats.Limit)
		memfields["docker_container_mem_usage"] = uint64(mem)
		memfields["docker_container_mem_usage_percent"] = CalculateMemPercentUnixNoCache(memLimit, mem)
	} else {
		memfields["docker_container_mem_commit_bytes"] = stat.MemoryStats.Commit
		memfields["docker_container_mem_commit_peak_bytes"] = stat.MemoryStats.CommitPeak
		memfields["docker_container_mem_private_working_set"] = stat.MemoryStats.PrivateWorkingSet
	}

	inputs.PushSamples(slist, memfields, tags, ins.Labels)

	// cpu
	if choice.Contains("cpu", ins.TotalInclude) {
		cpufields := map[string]interface{}{
			"docker_container_cpu_usage_total":                  stat.CPUStats.CPUUsage.TotalUsage,
			"docker_container_cpu_usage_in_usermode":            stat.CPUStats.CPUUsage.UsageInUsermode,
			"docker_container_cpu_usage_in_kernelmode":          stat.CPUStats.CPUUsage.UsageInKernelmode,
			"docker_container_cpu_usage_system":                 stat.CPUStats.SystemUsage,
			"docker_container_cpu_throttling_periods":           stat.CPUStats.ThrottlingData.Periods,
			"docker_container_cpu_throttling_throttled_periods": stat.CPUStats.ThrottlingData.ThrottledPeriods,
			"docker_container_cpu_throttling_throttled_time":    stat.CPUStats.ThrottlingData.ThrottledTime,
		}

		if ostype != "windows" {
			previousCPU := stat.PreCPUStats.CPUUsage.TotalUsage
			previousSystem := stat.PreCPUStats.SystemUsage
			cpuPercent := CalculateCPUPercentUnix(previousCPU, previousSystem, stat)
			cpufields["docker_container_cpu_usage_percent"] = cpuPercent
		} else {
			cpuPercent := calculateCPUPercentWindows(stat)
			cpufields["docker_container_cpu_usage_percent"] = cpuPercent
		}

		inputs.PushSamples(slist, cpufields, map[string]string{"cpu": "cpu-total"}, tags, ins.Labels)
	}

	if choice.Contains("cpu", ins.PerDeviceInclude) && len(stat.CPUStats.CPUUsage.PercpuUsage) > 0 {
		// PercpuUsage is padded out to the host's CPU count; when OnlineCPUs
		// is reported, use it to trim the padding
		var percpuusage []uint64
		if stat.CPUStats.OnlineCPUs > 0 {
			percpuusage = stat.CPUStats.CPUUsage.PercpuUsage[:stat.CPUStats.OnlineCPUs]
		} else {
			percpuusage = stat.CPUStats.CPUUsage.PercpuUsage
		}

		for i, percpu := range percpuusage {
			slist.PushFront(inputs.NewSample(
				"docker_container_cpu_usage_total",
				percpu,
				map[string]string{"cpu": fmt.Sprintf("cpu%d", i)},
				tags,
				ins.Labels,
			))
		}
	}

	// network
	totalNetworkStatMap := make(map[string]interface{})
	for network, netstats := range stat.Networks {
		netfields := map[string]interface{}{
			"docker_container_net_rx_dropped": netstats.RxDropped,
			"docker_container_net_rx_bytes":   netstats.RxBytes,
			"docker_container_net_rx_errors":  netstats.RxErrors,
			"docker_container_net_tx_packets": netstats.TxPackets,
			"docker_container_net_tx_dropped": netstats.TxDropped,
			"docker_container_net_rx_packets": netstats.RxPackets,
			"docker_container_net_tx_errors":  netstats.TxErrors,
			"docker_container_net_tx_bytes":   netstats.TxBytes,
		}

		if choice.Contains("network", ins.PerDeviceInclude) {
			inputs.PushSamples(slist, netfields, map[string]string{"network": network}, tags, ins.Labels)
		}

		if choice.Contains("network", ins.TotalInclude) {
			for field, value := range netfields {
				var uintV uint64
				switch v := value.(type) {
				case uint64:
					uintV = v
				case int64:
					uintV = uint64(v)
				default:
					continue
				}

				if _, ok := totalNetworkStatMap[field]; ok {
					totalNetworkStatMap[field] = totalNetworkStatMap[field].(uint64) + uintV
				} else {
					totalNetworkStatMap[field] = uintV
				}
			}
		}
	}

	// totalNetworkStatMap could be empty if container is running with --net=host.
	if choice.Contains("network", ins.TotalInclude) && len(totalNetworkStatMap) != 0 {
		inputs.PushSamples(slist, totalNetworkStatMap, map[string]string{"network": "total"}, tags, ins.Labels)
	}

	ins.gatherBlockIOMetrics(slist, stat, tags)
}
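
// gatherBlockIOMetrics emits block-I/O counters per device (tagged
// device="major:minor") and/or summed across devices (device="total").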
func (ins *Instance) gatherBlockIOMetrics(slist *list.SafeList, stat *types.StatsJSON, tags map[string]string) {
	perDeviceBlkio := choice.Contains("blkio", ins.PerDeviceInclude)
	totalBlkio := choice.Contains("blkio", ins.TotalInclude)

	blkioStats := stat.BlkioStats
	deviceStatMap := getDeviceStatMap(blkioStats)

	totalStatMap := make(map[string]interface{})
	for device, fields := range deviceStatMap {
		if perDeviceBlkio {
			inputs.PushSamples(slist, fields, map[string]string{"device": device}, tags, ins.Labels)
		}
		if totalBlkio {
			for field, value := range fields {
				var uintV uint64
				switch v := value.(type) {
				case uint64:
					uintV = v
				case int64:
					uintV = uint64(v)
				default:
					continue
				}
				if _, ok := totalStatMap[field]; ok {
					totalStatMap[field] = totalStatMap[field].(uint64) + uintV
				} else {
					totalStatMap[field] = uintV
				}
			}
		}
	}

	if totalBlkio {
		inputs.PushSamples(slist, totalStatMap, map[string]string{"device": "total"}, tags, ins.Labels)
	}
}
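
// getDeviceStatMap regroups the per-operation blkio counters by device key
// ("major:minor") so each device can be emitted under a single "device" tag.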
func getDeviceStatMap(blkioStats types.BlkioStats) map[string]map[string]interface{} {
	deviceStatMap := make(map[string]map[string]interface{})

	// fieldsFor returns the field map for a device, allocating it on first
	// use. Previously only the first two loops allocated, so a device that
	// appeared only in a later stat list caused an assignment to a nil map.
	fieldsFor := func(major, minor uint64) map[string]interface{} {
		device := fmt.Sprintf("%d:%d", major, minor)
		if _, ok := deviceStatMap[device]; !ok {
			deviceStatMap[device] = make(map[string]interface{})
		}
		return deviceStatMap[device]
	}

	for _, metric := range blkioStats.IoServiceBytesRecursive {
		field := "docker_container_blkio_io_service_bytes_recursive_" + strings.ToLower(metric.Op)
		fieldsFor(metric.Major, metric.Minor)[field] = metric.Value
	}

	for _, metric := range blkioStats.IoServicedRecursive {
		field := "docker_container_blkio_io_serviced_recursive_" + strings.ToLower(metric.Op)
		fieldsFor(metric.Major, metric.Minor)[field] = metric.Value
	}

	for _, metric := range blkioStats.IoQueuedRecursive {
		field := "docker_container_blkio_io_queue_recursive_" + strings.ToLower(metric.Op)
		fieldsFor(metric.Major, metric.Minor)[field] = metric.Value
	}

	for _, metric := range blkioStats.IoServiceTimeRecursive {
		field := "docker_container_blkio_io_service_time_recursive_" + strings.ToLower(metric.Op)
		fieldsFor(metric.Major, metric.Minor)[field] = metric.Value
	}

	for _, metric := range blkioStats.IoWaitTimeRecursive {
		// note: this field name has no _recursive suffix; kept as-is to
		// preserve the established metric name
		field := "docker_container_blkio_io_wait_time_" + strings.ToLower(metric.Op)
		fieldsFor(metric.Major, metric.Minor)[field] = metric.Value
	}

	for _, metric := range blkioStats.IoMergedRecursive {
		field := "docker_container_blkio_io_merged_recursive_" + strings.ToLower(metric.Op)
		fieldsFor(metric.Major, metric.Minor)[field] = metric.Value
	}

	for _, metric := range blkioStats.IoTimeRecursive {
		fieldsFor(metric.Major, metric.Minor)["docker_container_blkio_io_time_recursive"] = metric.Value
	}

	for _, metric := range blkioStats.SectorsRecursive {
		fieldsFor(metric.Major, metric.Minor)["docker_container_blkio_sectors_recursive"] = metric.Value
	}

	return deviceStatMap
}
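
// gatherSwarmInfo emits per-service task counts. For replicated services the
// desired count comes from the service spec; for global services it is the
// number of tasks whose desired state is not "shutdown".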
func (ins *Instance) gatherSwarmInfo(slist *list.SafeList) {
	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(ins.Timeout))
	defer cancel()

	services, err := ins.client.ServiceList(ctx, types.ServiceListOptions{})
	if errors.Is(err, context.DeadlineExceeded) {
		log.Println("E! failed to gather swarm services: timeout")
		return
	}
	if err != nil {
		log.Println("E! failed to gather swarm services:", err)
		return
	}

	if len(services) == 0 {
		return
	}

	tasks, err := ins.client.TaskList(ctx, types.TaskListOptions{})
	if err != nil {
		log.Println("E! failed to gather swarm tasks:", err)
		return
	}

	nodes, err := ins.client.NodeList(ctx, types.NodeListOptions{})
	if err != nil {
		log.Println("E! failed to gather swarm nodes:", err)
		return
	}

	// NOTE: activeNodes is computed but not currently used.
	activeNodes := make(map[string]struct{})
	for _, n := range nodes {
		if n.Status.State != swarm.NodeStateDown {
			activeNodes[n.ID] = struct{}{}
		}
	}

	running := map[string]int{}
	tasksNoShutdown := map[string]uint64{}
	for _, task := range tasks {
		if task.DesiredState != swarm.TaskStateShutdown {
			tasksNoShutdown[task.ServiceID]++
		}
		if task.Status.State == swarm.TaskStateRunning {
			running[task.ServiceID]++
		}
	}

	for _, service := range services {
		tags := map[string]string{
			"service_id":   service.ID,
			"service_name": service.Spec.Name,
		}
		fields := make(map[string]interface{})

		if service.Spec.Mode.Replicated != nil && service.Spec.Mode.Replicated.Replicas != nil {
			tags["service_mode"] = "replicated"
			fields["docker_swarm_tasks_running"] = running[service.ID]
			fields["docker_swarm_tasks_desired"] = *service.Spec.Mode.Replicated.Replicas
		} else if service.Spec.Mode.Global != nil {
			tags["service_mode"] = "global"
			fields["docker_swarm_tasks_running"] = running[service.ID]
			fields["docker_swarm_tasks_desired"] = tasksNoShutdown[service.ID]
		} else {
			log.Println("E! unknown service mode for service:", service.ID)
			continue
		}

		inputs.PushSamples(slist, fields, tags, ins.Labels)
	}
}
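
// gatherInfo reports daemon-level counters (containers by state, images,
// CPUs, memory, file descriptors) from the Docker info endpoint; it also
// serves as the liveness probe behind the docker_up metric.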
func (ins *Instance) gatherInfo(slist *list.SafeList) error {
	// Get info from docker daemon
	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(ins.Timeout))
	defer cancel()

	info, err := ins.client.Info(ctx)
	if errors.Is(err, context.DeadlineExceeded) {
		return errors.New("timeout")
	}
	if err != nil {
		return err
	}

	fields := map[string]interface{}{
		"docker_n_cpus":                  info.NCPU,
		"docker_n_used_file_descriptors": info.NFd,
		"docker_n_containers":            info.Containers,
		"docker_n_containers_running":    info.ContainersRunning,
		"docker_n_containers_stopped":    info.ContainersStopped,
		"docker_n_containers_paused":     info.ContainersPaused,
		"docker_n_images":                info.Images,
		"docker_memory_total":            info.MemTotal,
	}

	inputs.PushSamples(slist, fields, ins.Labels)
	return nil
}
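
// getNewClient builds a Docker client for the configured endpoint and TLS
// settings; the special endpoint "ENV" defers to NewEnvClient, which reads
// the connection settings from the environment. The client is pinged once to
// verify connectivity before it is returned.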
func (ins *Instance) getNewClient() (Client, error) {
	if ins.Endpoint == "ENV" {
		return NewEnvClient()
	}

	tlsConfig, err := ins.ClientConfig.TLSConfig()
	if err != nil {
		return nil, err
	}

	c, err := NewClient(ins.Endpoint, tlsConfig)
	if err != nil {
		return nil, err
	}

	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(ins.Timeout))
	defer cancel()

	if _, err := c.Ping(ctx); err != nil {
		return nil, err
	}

	return c, nil
}
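
// The three constructors below compile the include/exclude pattern lists into
// matchers once, at Init time, so the hot gather path only does matching.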
func (ins *Instance) createContainerFilters() error {
	containerFilter, err := filter.NewIncludeExcludeFilter(ins.ContainerInclude, ins.ContainerExclude)
	if err != nil {
		return err
	}
	ins.containerFilter = containerFilter
	return nil
}

func (ins *Instance) createLabelFilters() error {
	labelFilter, err := filter.NewIncludeExcludeFilter(ins.LabelInclude, ins.LabelExclude)
	if err != nil {
		return err
	}
	ins.labelFilter = labelFilter
	return nil
}

func (ins *Instance) createContainerStateFilters() error {
	// By default only running containers are gathered.
	if len(ins.ContainerStateInclude) == 0 && len(ins.ContainerStateExclude) == 0 {
		ins.ContainerStateInclude = []string{"running"}
	}
	stateFilter, err := filter.NewIncludeExcludeFilter(ins.ContainerStateInclude, ins.ContainerStateExclude)
	if err != nil {
		return err
	}
	ins.stateFilter = stateFilter
	return nil
}
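
// hostnameFromID shortens a full container ID to the 12-character short form
// (the same form Docker uses as a container's default hostname).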
func hostnameFromID(id string) string {
	if len(id) > 12 {
		return id[0:12]
	}
	return id
}
// parseSize parses a human-readable size string such as "10MB" or "1.5g"
// into the number of bytes it represents. Currently unused; kept commented
// out together with sizeRegex above.
// func parseSize(sizeStr string) (int64, error) {
// 	matches := sizeRegex.FindStringSubmatch(sizeStr)
// 	if len(matches) != 4 {
// 		return -1, fmt.Errorf("invalid size: %s", sizeStr)
// 	}
//
// 	size, err := strconv.ParseFloat(matches[1], 64)
// 	if err != nil {
// 		return -1, err
// 	}
//
// 	uMap := map[string]int64{"k": KB, "m": MB, "g": GB, "t": TB, "p": PB}
// 	unitPrefix := strings.ToLower(matches[3])
// 	if mul, ok := uMap[unitPrefix]; ok {
// 		size *= float64(mul)
// 	}
//
// 	return int64(size), nil
// }