// categraf/prometheus/prometheus.go

package prometheus
import (
"context"
"errors"
"fmt"
"math"
"net"
"net/url"
"os"
"os/signal"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
coreconfig "flashcat.cloud/categraf/config"
"github.com/alecthomas/units"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/oklog/run"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
"github.com/prometheus/common/promlog"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/discovery"
_ "github.com/prometheus/prometheus/discovery/install"
"github.com/prometheus/prometheus/model/exemplar"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/notifier"
"github.com/prometheus/prometheus/scrape"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/storage/remote"
"github.com/prometheus/prometheus/tsdb"
"github.com/prometheus/prometheus/tsdb/agent"
prom_runtime "github.com/prometheus/prometheus/util/runtime"
"github.com/prometheus/prometheus/web"
)
// The scrape configuration is a separate Prometheus-style YAML file, referenced from
// categraf's TOML config (coreconfig.Config.Prometheus.ScrapeConfigFile).
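// readyScrapeManager allows a scrape.Manager to be set later and returns
// ErrNotReady from Get until one has been provided.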
type readyScrapeManager struct {
mtx sync.RWMutex
m *scrape.Manager
}
// Set the scrape manager.
func (rm *readyScrapeManager) Set(m *scrape.Manager) {
rm.mtx.Lock()
defer rm.mtx.Unlock()
rm.m = m
}
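// Get returns the scrape manager, or ErrNotReady if it has not been set yet.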
func (rm *readyScrapeManager) Get() (*scrape.Manager, error) {
rm.mtx.RLock()
defer rm.mtx.RUnlock()
if rm.m != nil {
return rm.m, nil
}
return nil, ErrNotReady
}
// readyStorage implements the Storage interface while allowing the actual
// storage to be set at a later point in time.
type readyStorage struct {
mtx sync.RWMutex
db storage.Storage
startTimeMargin int64
stats *tsdb.DBStats
}
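// ApplyConfig forwards the new configuration to the underlying *tsdb.DB, if one is set;
// for any other storage it is a no-op.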
func (s *readyStorage) ApplyConfig(conf *config.Config) error {
db := s.get()
if db, ok := db.(*tsdb.DB); ok {
return db.ApplyConfig(conf)
}
return nil
}
// Set the storage.
func (s *readyStorage) Set(db storage.Storage, startTimeMargin int64) {
s.mtx.Lock()
defer s.mtx.Unlock()
s.db = db
s.startTimeMargin = startTimeMargin
}
func (s *readyStorage) get() storage.Storage {
s.mtx.RLock()
x := s.db
s.mtx.RUnlock()
return x
}
func (s *readyStorage) getStats() *tsdb.DBStats {
s.mtx.RLock()
x := s.stats
s.mtx.RUnlock()
return x
}
// StartTime implements the Storage interface.
func (s *readyStorage) StartTime() (int64, error) {
if x := s.get(); x != nil {
switch db := x.(type) {
case *tsdb.DB:
var startTime int64
if len(db.Blocks()) > 0 {
startTime = db.Blocks()[0].Meta().MinTime
} else {
startTime = time.Now().Unix() * 1000
}
// Add a safety margin as it may take a few minutes for everything to spin up.
return startTime + s.startTimeMargin, nil
default:
panic(fmt.Sprintf("unknown storage type %T", db))
}
}
return math.MaxInt64, tsdb.ErrNotReady
}
// Querier implements the Storage interface.
func (s *readyStorage) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) {
if x := s.get(); x != nil {
return x.Querier(ctx, mint, maxt)
}
return nil, tsdb.ErrNotReady
}
// ChunkQuerier implements the Storage interface.
func (s *readyStorage) ChunkQuerier(ctx context.Context, mint, maxt int64) (storage.ChunkQuerier, error) {
if x := s.get(); x != nil {
return x.ChunkQuerier(ctx, mint, maxt)
}
return nil, tsdb.ErrNotReady
}
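// ExemplarQuerier implements the Storage interface.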
func (s *readyStorage) ExemplarQuerier(ctx context.Context) (storage.ExemplarQuerier, error) {
if x := s.get(); x != nil {
switch db := x.(type) {
case *tsdb.DB:
return db.ExemplarQuerier(ctx)
default:
panic(fmt.Sprintf("unknown storage type %T", db))
}
}
return nil, tsdb.ErrNotReady
}
// Appender implements the Storage interface.
func (s *readyStorage) Appender(ctx context.Context) storage.Appender {
if x := s.get(); x != nil {
return x.Appender(ctx)
}
return notReadyAppender{}
}
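// notReadyAppender is returned by Appender while the storage is not ready; every
// method reports tsdb.ErrNotReady.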
type notReadyAppender struct{}
func (n notReadyAppender) Append(ref storage.SeriesRef, l labels.Labels, t int64, v float64) (storage.SeriesRef, error) {
return 0, tsdb.ErrNotReady
}
func (n notReadyAppender) AppendExemplar(ref storage.SeriesRef, l labels.Labels, e exemplar.Exemplar) (storage.SeriesRef, error) {
return 0, tsdb.ErrNotReady
}
func (n notReadyAppender) Commit() error { return tsdb.ErrNotReady }
func (n notReadyAppender) Rollback() error { return tsdb.ErrNotReady }
// Close implements the Storage interface.
func (s *readyStorage) Close() error {
if x := s.get(); x != nil {
return x.Close()
}
return nil
}
// CleanTombstones implements the api_v1.TSDBAdminStats and api_v2.TSDBAdmin interfaces.
func (s *readyStorage) CleanTombstones() error {
if x := s.get(); x != nil {
switch db := x.(type) {
case *tsdb.DB:
return db.CleanTombstones()
default:
panic(fmt.Sprintf("unknown storage type %T", db))
}
}
return tsdb.ErrNotReady
}
// Delete implements the api_v1.TSDBAdminStats and api_v2.TSDBAdmin interfaces.
func (s *readyStorage) Delete(mint, maxt int64, ms ...*labels.Matcher) error {
if x := s.get(); x != nil {
switch db := x.(type) {
case *tsdb.DB:
return db.Delete(mint, maxt, ms...)
default:
panic(fmt.Sprintf("unknown storage type %T", db))
}
}
return tsdb.ErrNotReady
}
// Snapshot implements the api_v1.TSDBAdminStats and api_v2.TSDBAdmin interfaces.
func (s *readyStorage) Snapshot(dir string, withHead bool) error {
if x := s.get(); x != nil {
switch db := x.(type) {
case *tsdb.DB:
return db.Snapshot(dir, withHead)
default:
panic(fmt.Sprintf("unknown storage type %T", db))
}
}
return tsdb.ErrNotReady
}
// Stats implements the api_v1.TSDBAdminStats interface.
func (s *readyStorage) Stats(statsByLabelName string) (*tsdb.Stats, error) {
if x := s.get(); x != nil {
switch db := x.(type) {
case *tsdb.DB:
return db.Head().Stats(statsByLabelName), nil
default:
panic(fmt.Sprintf("unknown storage type %T", db))
}
}
return nil, tsdb.ErrNotReady
}
// WALReplayStatus implements the api_v1.TSDBStats interface.
func (s *readyStorage) WALReplayStatus() (tsdb.WALReplayStatus, error) {
if x := s.getStats(); x != nil {
return x.Head.WALReplayStatus.GetWALReplayStatus(), nil
}
return tsdb.WALReplayStatus{}, tsdb.ErrNotReady
}
// ErrNotReady is returned if the underlying scrape manager is not ready yet.
var ErrNotReady = errors.New("Scrape manager not ready")
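// reloader pairs a component name with the callback used to apply a freshly loaded
// configuration to it.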
type reloader struct {
name string
reloader func(*config.Config) error
}
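// reloadConfig loads the Prometheus configuration file and applies it through every
// registered reloader, logging the time spent in each one.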
func reloadConfig(filename string, expandExternalLabels, enableExemplarStorage bool, logger log.Logger, rls ...reloader) (err error) {
start := time.Now()
timings := []interface{}{}
level.Info(logger).Log("msg", "Loading configuration file", "filename", filename)
conf, err := config.LoadFile(filename, true, false, logger)
if err != nil {
return fmt.Errorf("%s couldn't load configuration (--config.file=%q)", err, filename)
}
if enableExemplarStorage {
if conf.StorageConfig.ExemplarsConfig == nil {
conf.StorageConfig.ExemplarsConfig = &config.DefaultExemplarsConfig
}
}
failed := false
for _, rl := range rls {
rstart := time.Now()
if err := rl.reloader(conf); err != nil {
level.Error(logger).Log("msg", "Failed to apply configuration", "err", err)
failed = true
}
timings = append(timings, rl.name, time.Since(rstart))
}
if failed {
return fmt.Errorf("one or more errors occurred while applying the new configuration (--config.file=%q)", filename)
}
l := []interface{}{"msg", "Completed loading of configuration file", "filename", filename, "totalDuration", time.Since(start)}
level.Info(logger).Log(append(l, timings...)...)
return nil
}
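// flagConfig carries the Prometheus command-line options supported by this embedded
// agent; here they are populated from categraf's own configuration instead of flags.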
type flagConfig struct {
configFile string
agentStoragePath string
serverStoragePath string
notifier notifier.Options
forGracePeriod model.Duration
outageTolerance model.Duration
resendDelay model.Duration
web web.Options
scrape scrape.Options
tsdb tsdbOptions
agent agentOptions
lookbackDelta model.Duration
webTimeout model.Duration
queryTimeout model.Duration
queryConcurrency int
queryMaxSamples int
RemoteFlushDeadline model.Duration
featureList []string
// These options are extracted from featureList
// for ease of use.
enableExpandExternalLabels bool
enableNewSDManager bool
enablePerStepStats bool
enableAutoGOMAXPROCS bool
prometheusURL string
corsRegexString string
promlogConfig promlog.Config
}
// setFeatureListOptions sets the corresponding options from the featureList.
func (c *flagConfig) setFeatureListOptions(logger log.Logger) error {
for _, f := range c.featureList {
opts := strings.Split(f, ",")
for _, o := range opts {
switch o {
case "expand-external-labels":
c.enableExpandExternalLabels = true
level.Info(logger).Log("msg", "Experimental expand-external-labels enabled")
case "exemplar-storage":
c.tsdb.EnableExemplarStorage = true
level.Info(logger).Log("msg", "Experimental in-memory exemplar storage enabled")
case "memory-snapshot-on-shutdown":
c.tsdb.EnableMemorySnapshotOnShutdown = true
level.Info(logger).Log("msg", "Experimental memory snapshot on shutdown enabled")
case "extra-scrape-metrics":
c.scrape.ExtraMetrics = true
level.Info(logger).Log("msg", "Experimental additional scrape metrics")
case "new-service-discovery-manager":
c.enableNewSDManager = true
level.Info(logger).Log("msg", "Experimental service discovery manager")
case "agent":
level.Info(logger).Log("msg", "Experimental agent mode enabled.")
case "promql-per-step-stats":
c.enablePerStepStats = true
level.Info(logger).Log("msg", "Experimental per-step statistics reporting")
case "auto-gomaxprocs":
c.enableAutoGOMAXPROCS = true
level.Info(logger).Log("msg", "Automatically set GOMAXPROCS to match Linux container CPU quota")
case "":
continue
case "promql-at-modifier", "promql-negative-offset":
level.Warn(logger).Log("msg", "This option for --enable-feature is now permanently enabled and therefore a no-op.", "option", o)
default:
level.Warn(logger).Log("msg", "Unknown option for --enable-feature", "option", o)
}
}
}
return nil
}
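// tsdbOptions is a version of tsdb.Options with defined units. This is required
// as tsdb.Options fields are unit agnostic (time).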
type tsdbOptions struct {
WALSegmentSize units.Base2Bytes
MaxBlockChunkSegmentSize units.Base2Bytes
RetentionDuration model.Duration
MaxBytes units.Base2Bytes
NoLockfile bool
AllowOverlappingBlocks bool
WALCompression bool
HeadChunksWriteQueueSize int
StripeSize int
MinBlockDuration model.Duration
MaxBlockDuration model.Duration
EnableExemplarStorage bool
MaxExemplars int64
EnableMemorySnapshotOnShutdown bool
}
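// ToTSDBOptions converts the unit-aware tsdbOptions into tsdb.Options.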
func (opts tsdbOptions) ToTSDBOptions() tsdb.Options {
return tsdb.Options{
WALSegmentSize: int(opts.WALSegmentSize),
MaxBlockChunkSegmentSize: int64(opts.MaxBlockChunkSegmentSize),
RetentionDuration: int64(time.Duration(opts.RetentionDuration) / time.Millisecond),
MaxBytes: int64(opts.MaxBytes),
NoLockfile: opts.NoLockfile,
AllowOverlappingBlocks: opts.AllowOverlappingBlocks,
WALCompression: opts.WALCompression,
HeadChunksWriteQueueSize: opts.HeadChunksWriteQueueSize,
StripeSize: opts.StripeSize,
MinBlockDuration: int64(time.Duration(opts.MinBlockDuration) / time.Millisecond),
MaxBlockDuration: int64(time.Duration(opts.MaxBlockDuration) / time.Millisecond),
EnableExemplarStorage: opts.EnableExemplarStorage,
MaxExemplars: opts.MaxExemplars,
EnableMemorySnapshotOnShutdown: opts.EnableMemorySnapshotOnShutdown,
}
}
// agentOptions is a version of agent.Options with defined units. This is required
// as agent.Options fields are unit agnostic (time).
type agentOptions struct {
WALSegmentSize units.Base2Bytes
WALCompression bool
StripeSize int
TruncateFrequency model.Duration
MinWALTime, MaxWALTime model.Duration
NoLockfile bool
}
func durationToInt64Millis(d time.Duration) int64 {
return int64(d / time.Millisecond)
}
func startsOrEndsWithQuote(s string) bool {
return strings.HasPrefix(s, "\"") || strings.HasPrefix(s, "'") ||
strings.HasSuffix(s, "\"") || strings.HasSuffix(s, "'")
}
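// computeExternalURL computes a sanitized external URL from a raw input. It infers
// unset URL parts from the OS hostname and the given listen address.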
func computeExternalURL(u, listenAddr string) (*url.URL, error) {
if u == "" {
hostname, err := os.Hostname()
if err != nil {
return nil, err
}
_, port, err := net.SplitHostPort(listenAddr)
if err != nil {
return nil, err
}
u = fmt.Sprintf("http://%s:%s/", hostname, port)
}
if startsOrEndsWithQuote(u) {
return nil, errors.New("URL must not begin or end with quotes")
}
eu, err := url.Parse(u)
if err != nil {
return nil, err
}
ppref := strings.TrimRight(eu.Path, "/")
if ppref != "" && !strings.HasPrefix(ppref, "/") {
ppref = "/" + ppref
}
eu.Path = ppref
return eu, nil
}
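// ToAgentOptions converts the unit-aware agentOptions into agent.Options.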
func (opts agentOptions) ToAgentOptions() agent.Options {
return agent.Options{
WALSegmentSize: int(opts.WALSegmentSize),
WALCompression: opts.WALCompression,
StripeSize: opts.StripeSize,
TruncateFrequency: time.Duration(opts.TruncateFrequency),
MinWALTime: durationToInt64Millis(time.Duration(opts.MinWALTime)),
MaxWALTime: durationToInt64Millis(time.Duration(opts.MaxWALTime)),
NoLockfile: opts.NoLockfile,
}
}
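// stop can be closed to terminate the run group started by Start; isRunning guards
// against starting the embedded agent more than once.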
var (
stop = make(chan struct{})
isRunning int32
)
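// Start boots the embedded Prometheus agent: it wires service discovery, the scrape
// manager, remote-write (WAL) storage, the web handler, and the notifier into an
// oklog/run group and blocks until that group stops. A second call while the agent is
// already running returns immediately.
//
// Minimal usage sketch (how the caller invokes it is illustrative, not prescribed here):
//
//	go prometheus.Start() // blocks until the run group exits
//	// ...
//	prometheus.Stop()     // currently a no-op, see below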
func Start() {
var (
err error
)
if atomic.LoadInt32(&isRunning) > 0 {
return
}
cfg := flagConfig{
notifier: notifier.Options{
Registerer: prometheus.DefaultRegisterer,
},
web: web.Options{
Registerer: prometheus.DefaultRegisterer,
Gatherer: prometheus.DefaultGatherer,
},
promlogConfig: promlog.Config{
Level: &promlog.AllowedLevel{},
},
}
if coreconfig.Config.DebugMode || coreconfig.Config.TestMode {
cfg.promlogConfig.Level.Set("debug")
} else {
cfg.promlogConfig.Level.Set(coreconfig.Config.Prometheus.LogLevel)
}
if cfg.promlogConfig.Level.String() == "" {
cfg.promlogConfig.Level.Set("info")
}
logger := promlog.New(&cfg.promlogConfig)
if err = cfg.setFeatureListOptions(logger); err != nil {
fmt.Fprintln(os.Stderr, fmt.Errorf("%s Error parsing feature list", err))
os.Exit(1)
}
notifierManager := notifier.NewManager(&cfg.notifier, log.With(logger, "component", "notifier"))
ctxScrape, cancelScrape := context.WithCancel(context.Background())
ctxNotify, cancelNotify := context.WithCancel(context.Background())
discoveryManagerScrape := discovery.NewManager(ctxScrape, log.With(logger, "component", "discovery manager scrape"), discovery.Name("scrape"))
discoveryManagerNotify := discovery.NewManager(ctxNotify, log.With(logger, "component", "discovery manager notify"), discovery.Name("notify"))
if cfg.scrape.ExtraMetrics {
// Experimental additional scrape metrics
// TODO scrapeopts configurable
}
localStorage := &readyStorage{stats: tsdb.NewDBStats()}
cfg.agentStoragePath = coreconfig.Config.Prometheus.StoragePath
if len(cfg.agentStoragePath) == 0 {
cfg.agentStoragePath = "./data-agent"
}
if cfg.tsdb.MinBlockDuration == model.Duration(0) {
cfg.tsdb.MinBlockDuration = model.Duration(2 * time.Hour)
}
if cfg.webTimeout == model.Duration(0) {
cfg.webTimeout = model.Duration(time.Minute * 5)
}
cfg.web.ReadTimeout = time.Duration(cfg.webTimeout)
if cfg.web.MaxConnections == 0 {
cfg.web.MaxConnections = 512
}
cfg.web.EnableAdminAPI = false
cfg.web.EnableRemoteWriteReceiver = false
if cfg.web.RemoteReadSampleLimit == 0 {
cfg.web.RemoteReadSampleLimit = 5e7
}
if cfg.web.RemoteReadConcurrencyLimit == 0 {
cfg.web.RemoteReadConcurrencyLimit = 10
}
if cfg.web.RemoteReadBytesInFrame == 0 {
cfg.web.RemoteReadBytesInFrame = 1048576
}
if len(cfg.configFile) == 0 {
cfg.configFile = coreconfig.Config.Prometheus.ScrapeConfigFile
}
scraper := &readyScrapeManager{}
remoteFlushDeadline := 1 * time.Minute
localStoragePath := cfg.agentStoragePath
remoteStorage := remote.NewStorage(log.With(logger, "component", "remote"), prometheus.DefaultRegisterer, localStorage.StartTime, localStoragePath, remoteFlushDeadline, scraper)
fanoutStorage := storage.NewFanout(logger, localStorage, remoteStorage)
scrapeManager := scrape.NewManager(&cfg.scrape, log.With(logger, "component", "scrape manager"), fanoutStorage)
scraper.Set(scrapeManager)
cfg.web.ListenAddress = coreconfig.Config.Prometheus.WebAddress
if len(cfg.web.ListenAddress) == 0 {
cfg.web.ListenAddress = "127.0.0.1:0"
}
cfg.web.ExternalURL, err = computeExternalURL(cfg.prometheusURL, cfg.web.ListenAddress)
if err != nil {
fmt.Fprintln(os.Stderr, fmt.Errorf("%s parse external URL %q", err, cfg.prometheusURL))
os.Exit(2)
}
if cfg.web.RoutePrefix == "" {
cfg.web.RoutePrefix = cfg.web.ExternalURL.Path
}
// RoutePrefix must always be at least '/'.
cfg.web.RoutePrefix = "/" + strings.Trim(cfg.web.RoutePrefix, "/")
ctxWeb, cancelWeb := context.WithCancel(context.Background())
cfg.web.Context = ctxWeb
cfg.web.TSDBRetentionDuration = cfg.tsdb.RetentionDuration
cfg.web.TSDBMaxBytes = cfg.tsdb.MaxBytes
cfg.web.TSDBDir = localStoragePath
cfg.web.LocalStorage = localStorage
cfg.web.Storage = fanoutStorage
cfg.web.ExemplarStorage = localStorage
cfg.web.ScrapeManager = scrapeManager
cfg.web.QueryEngine = nil
cfg.web.RuleManager = nil
cfg.web.Notifier = notifierManager
cfg.web.LookbackDelta = time.Duration(cfg.lookbackDelta)
cfg.web.IsAgent = true
webHandler := web.New(log.With(logger, "component", "web"), &cfg.web)
listener, err := webHandler.Listener()
if err != nil {
level.Error(logger).Log("msg", "Unable to start web listener", "err", err)
os.Exit(1)
}
reloaders := []reloader{
{
name: "remote_storage",
reloader: remoteStorage.ApplyConfig,
}, {
name: "web_handler",
reloader: webHandler.ApplyConfig,
}, {
// The Scrape and notifier managers need to reload before the Discovery manager as
// they need to read the most updated config when receiving the new targets list.
name: "scrape",
reloader: scrapeManager.ApplyConfig,
}, {
name: "scrape_sd",
reloader: func(cfg *config.Config) error {
c := make(map[string]discovery.Configs)
for _, v := range cfg.ScrapeConfigs {
c[v.JobName] = v.ServiceDiscoveryConfigs
}
return discoveryManagerScrape.ApplyConfig(c)
},
}, {
name: "notify",
reloader: notifierManager.ApplyConfig,
}, {
name: "notify_sd",
reloader: func(cfg *config.Config) error {
c := make(map[string]discovery.Configs)
for k, v := range cfg.AlertingConfig.AlertmanagerConfigs.ToMap() {
c[k] = v.ServiceDiscoveryConfigs
}
return discoveryManagerNotify.ApplyConfig(c)
},
},
}
dbOpen := make(chan struct{})
type closeOnce struct {
C chan struct{}
once sync.Once
Close func()
}
// Wait until the server is ready to handle reloading.
reloadReady := &closeOnce{
C: make(chan struct{}),
}
reloadReady.Close = func() {
reloadReady.once.Do(func() {
close(reloadReady.C)
})
}
var g run.Group
{
// Termination handler.
term := make(chan os.Signal, 1)
signal.Notify(term, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT, syscall.SIGPIPE)
cancel := make(chan struct{})
g.Add(
func() error {
// Release the reloadReady channel so that anything waiting on it can exit normally.
select {
case sig := <-term:
level.Warn(logger).Log("msg", "Received "+sig.String())
if sig != syscall.SIGPIPE {
level.Warn(logger).Log("msg", "exiting gracefully...")
reloadReady.Close()
}
case <-webHandler.Quit():
level.Warn(logger).Log("msg", "Received termination request via web service, exiting gracefully...")
case <-cancel:
reloadReady.Close()
case <-stop:
reloadReady.Close()
}
atomic.StoreInt32(&isRunning, 0)
return nil
},
func(err error) {
close(cancel)
},
)
}
{
// Scrape discovery manager.
g.Add(
func() error {
err := discoveryManagerScrape.Run()
level.Info(logger).Log("msg", "Scrape discovery manager stopped")
return err
},
func(err error) {
level.Info(logger).Log("msg", "Stopping scrape discovery manager...")
cancelScrape()
},
)
}
{
// Notify discovery manager.
g.Add(
func() error {
err := discoveryManagerNotify.Run()
level.Info(logger).Log("msg", "Notify discovery manager stopped")
return err
},
func(err error) {
level.Info(logger).Log("msg", "Stopping notify discovery manager...")
cancelNotify()
},
)
}
{
// Scrape manager.
g.Add(
func() error {
// When the scrape manager receives a new targets list
// it needs to read a valid config for each job.
// It depends on the config being in sync with the discovery manager so
// we wait until the config is fully loaded.
<-reloadReady.C
err := scrapeManager.Run(discoveryManagerScrape.SyncCh())
level.Info(logger).Log("msg", "Scrape manager stopped")
return err
},
func(err error) {
// Scrape manager needs to be stopped before closing the local TSDB
// so that it doesn't try to write samples to a closed storage.
level.Info(logger).Log("msg", "Stopping scrape manager...")
scrapeManager.Stop()
},
)
}
{
// Reload handler.
// Make sure that sighup handler is registered with a redirect to the channel before the potentially
// long and synchronous tsdb init.
hup := make(chan os.Signal, 1)
signal.Notify(hup, syscall.SIGHUP)
cancel := make(chan struct{})
g.Add(
func() error {
<-reloadReady.C
for {
select {
case <-hup:
if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, reloaders...); err != nil {
level.Error(logger).Log("msg", "Error reloading config", "err", err)
}
case rc := <-webHandler.Reload():
if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, reloaders...); err != nil {
level.Error(logger).Log("msg", "Error reloading config", "err", err)
rc <- err
} else {
rc <- nil
}
case <-cancel:
return nil
}
}
},
func(err error) {
// Wait for any in-progress reloads to complete to avoid
// reloading things after they have been shutdown.
cancel <- struct{}{}
},
)
}
{
// Initial configuration loading.
cancel := make(chan struct{})
g.Add(
func() error {
select {
case <-dbOpen:
// In case a shutdown is initiated before the dbOpen is released
case <-cancel:
reloadReady.Close()
return nil
}
if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, reloaders...); err != nil {
return fmt.Errorf("%s error loading config from %q", err, cfg.configFile)
}
reloadReady.Close()
webHandler.SetReady(true)
level.Info(logger).Log("msg", "Server is ready to receive web requests.")
<-cancel
return nil
},
func(err error) {
close(cancel)
},
)
}
{
// WAL storage.
opts := cfg.agent.ToAgentOptions()
cancel := make(chan struct{})
g.Add(
func() error {
level.Info(logger).Log("msg", "Starting WAL storage ...")
if cfg.agent.WALSegmentSize != 0 {
if cfg.agent.WALSegmentSize < 10*1024*1024 || cfg.agent.WALSegmentSize > 256*1024*1024 {
return errors.New("flag 'storage.agent.wal-segment-size' must be set between 10MB and 256MB")
}
}
db, err := agent.Open(
logger,
prometheus.DefaultRegisterer,
remoteStorage,
localStoragePath,
&opts,
)
if err != nil {
return fmt.Errorf("opening storage failed %s", err)
}
switch fsType := prom_runtime.Statfs(localStoragePath); fsType {
case "NFS_SUPER_MAGIC":
level.Warn(logger).Log("fs_type", fsType, "msg", "This filesystem is not supported and may lead to data corruption and data loss. Please carefully read https://prometheus.io/docs/prometheus/latest/storage/ to learn more about supported filesystems.")
default:
level.Info(logger).Log("fs_type", fsType)
}
level.Info(logger).Log("msg", "Agent WAL storage started")
level.Debug(logger).Log("msg", "Agent WAL storage options",
"WALSegmentSize", cfg.agent.WALSegmentSize,
"WALCompression", cfg.agent.WALCompression,
"StripeSize", cfg.agent.StripeSize,
"TruncateFrequency", cfg.agent.TruncateFrequency,
"MinWALTime", cfg.agent.MinWALTime,
"MaxWALTime", cfg.agent.MaxWALTime,
)
localStorage.Set(db, 0)
close(dbOpen)
<-cancel
return nil
},
func(e error) {
if err := fanoutStorage.Close(); err != nil {
level.Error(logger).Log("msg", "Error stopping storage", "err", err)
}
close(cancel)
},
)
}
{
// Web handler.
g.Add(
func() error {
if err := webHandler.Run(ctxWeb, listener, ""); err != nil {
return fmt.Errorf("%s error starting web server", err)
}
return nil
},
func(err error) {
cancelWeb()
},
)
}
{
// Notifier.
// In a full Prometheus server, calling notifier.Stop() before ruleManager.Stop() will
// panic if the rule manager isn't running, so this interrupt is kept last. Agent mode
// has no rule manager, but the ordering is preserved.
g.Add(
func() error {
// When the notifier manager receives a new targets list
// it needs to read a valid config for each job.
// It depends on the config being in sync with the discovery manager
// so we wait until the config is fully loaded.
<-reloadReady.C
notifierManager.Run(discoveryManagerNotify.SyncCh())
level.Info(logger).Log("msg", "Notifier manager stopped")
return nil
},
func(err error) {
notifierManager.Stop()
},
)
}
atomic.StoreInt32(&isRunning, 1)
if err := g.Run(); err != nil {
level.Error(logger).Log("err", err)
os.Exit(1)
}
level.Info(logger).Log("msg", "See you next time!")
}
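// Stop is currently a no-op; closing the stop channel (left commented out below) would
// signal the termination handler in Start to shut the run group down.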
func Stop() {
// if stop != nil {
// close(stop)
// }
}