448 lines
13 KiB
Go
448 lines
13 KiB
Go
// Package lightstep implements the LightStep OpenTracing client for Go.
|
|
package lightstep
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"runtime"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/opentracing/opentracing-go"
|
|
)
|
|
|
|
// Tracer extends the `opentracing.Tracer` interface with methods for manual
|
|
// flushing and closing. To access these methods, you can take the global
|
|
// tracer and typecast it to a `lightstep.Tracer`. As a convenience, the
|
|
// lightstep package provides static functions which perform the typecasting.
|
|
type Tracer interface {
|
|
opentracing.Tracer
|
|
|
|
// Close flushes and then terminates the LightStep collector
|
|
Close(context.Context)
|
|
// Flush sends all spans currently in the buffer to the LighStep collector
|
|
Flush(context.Context)
|
|
// Options gets the Options used in New() or NewWithOptions().
|
|
Options() Options
|
|
// Disable prevents the tracer from recording spans or flushing
|
|
Disable()
|
|
}
|
|
|
|
// Implements the `Tracer` interface. Buffers spans and forwards to a Lightstep collector.
|
|
type tracerImpl struct {
|
|
//////////////////////////////////////////////////////////////
|
|
// IMMUTABLE IMMUTABLE IMMUTABLE IMMUTABLE IMMUTABLE IMMUTABLE
|
|
//////////////////////////////////////////////////////////////
|
|
|
|
// Note: there may be a desire to update some of these fields
|
|
// at runtime, in which case suitable changes may be needed
|
|
// for variables accessed during Flush.
|
|
|
|
reporterID uint64 // the LightStep tracer guid
|
|
opts Options
|
|
|
|
// report loop management
|
|
closeOnce sync.Once
|
|
closeReportLoopChannel chan struct{}
|
|
reportLoopClosedChannel chan struct{}
|
|
|
|
converter *protoConverter
|
|
accessToken string
|
|
attributes map[string]string
|
|
|
|
//////////////////////////////////////////////////////////
|
|
// MUTABLE MUTABLE MUTABLE MUTABLE MUTABLE MUTABLE MUTABLE
|
|
//////////////////////////////////////////////////////////
|
|
|
|
// the following fields are modified under `lock`.
|
|
lock sync.Mutex
|
|
|
|
// Remote service that will receive reports.
|
|
client collectorClient
|
|
connection Connection
|
|
|
|
// Two buffers of data.
|
|
buffer reportBuffer
|
|
flushing reportBuffer
|
|
|
|
// Flush state.
|
|
flushingLock sync.Mutex
|
|
reportInFlight bool
|
|
lastReportAttempt time.Time
|
|
|
|
// Meta Event Reporting can be enabled at tracer creation or on-demand by satellite
|
|
metaEventReportingEnabled bool
|
|
// Set to true on first report
|
|
firstReportHasRun bool
|
|
|
|
// We allow our remote peer to disable this instrumentation at any
|
|
// time, turning all potentially costly runtime operations into
|
|
// no-ops.
|
|
//
|
|
// TODO this should use atomic load/store to test disabled
|
|
// prior to taking the lock, do please.
|
|
disabled bool
|
|
|
|
// Map of propagators used to determine the correct propagator to use
|
|
// based on the format passed into Inject/Extract. Supports one
|
|
// propagator for each of the formats: TextMap, HTTPHeaders, Binary
|
|
propagators map[opentracing.BuiltinFormat]Propagator
|
|
}
|
|
|
|
// NewTracer creates and starts a new Lightstep Tracer.
|
|
// In case of error, we emit event and return nil.
|
|
func NewTracer(opts Options) Tracer {
|
|
tr, err := CreateTracer(opts)
|
|
if err != nil {
|
|
emitEvent(newEventStartError(err))
|
|
return nil
|
|
}
|
|
return tr
|
|
}
|
|
|
|
// CreateTracer creates and starts a new Lightstep Tracer.
|
|
// It is meant to replace NewTracer which does not propagate the error.
|
|
func CreateTracer(opts Options) (Tracer, error) {
|
|
if err := opts.Initialize(); err != nil {
|
|
return nil, fmt.Errorf("init; err: %v", err)
|
|
}
|
|
|
|
attributes := map[string]string{}
|
|
for k, v := range opts.Tags {
|
|
attributes[k] = fmt.Sprint(v)
|
|
}
|
|
// Don't let the GrpcOptions override these values. That would be confusing.
|
|
attributes[TracerPlatformKey] = TracerPlatformValue
|
|
attributes[TracerPlatformVersionKey] = runtime.Version()
|
|
attributes[TracerVersionKey] = TracerVersionValue
|
|
|
|
now := time.Now()
|
|
impl := &tracerImpl{
|
|
opts: opts,
|
|
reporterID: genSeededGUID(),
|
|
buffer: newSpansBuffer(opts.MaxBufferedSpans),
|
|
flushing: newSpansBuffer(opts.MaxBufferedSpans),
|
|
closeReportLoopChannel: make(chan struct{}),
|
|
reportLoopClosedChannel: make(chan struct{}),
|
|
converter: newProtoConverter(opts),
|
|
accessToken: opts.AccessToken,
|
|
attributes: attributes,
|
|
}
|
|
|
|
impl.buffer.setCurrent(now)
|
|
|
|
var err error
|
|
impl.client, err = newCollectorClient(opts)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("create collector client; err: %v", err)
|
|
}
|
|
|
|
conn, err := impl.client.ConnectClient()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
impl.connection = conn
|
|
|
|
// set meta reporting to defined option
|
|
impl.metaEventReportingEnabled = opts.MetaEventReportingEnabled
|
|
impl.firstReportHasRun = false
|
|
|
|
go impl.reportLoop()
|
|
|
|
impl.propagators = map[opentracing.BuiltinFormat]Propagator{
|
|
opentracing.TextMap: LightStepPropagator,
|
|
opentracing.HTTPHeaders: LightStepPropagator,
|
|
opentracing.Binary: BinaryPropagator,
|
|
}
|
|
for builtin, propagator := range opts.Propagators {
|
|
impl.propagators[builtin] = propagator
|
|
}
|
|
|
|
return impl, nil
|
|
}
|
|
|
|
func (tracer *tracerImpl) Options() Options {
|
|
return tracer.opts
|
|
}
|
|
|
|
func (tracer *tracerImpl) StartSpan(
|
|
operationName string,
|
|
sso ...opentracing.StartSpanOption,
|
|
) opentracing.Span {
|
|
return newSpan(operationName, tracer, sso)
|
|
}
|
|
|
|
func (tracer *tracerImpl) Inject(sc opentracing.SpanContext, format interface{}, carrier interface{}) error {
|
|
if tracer.opts.MetaEventReportingEnabled {
|
|
opentracing.StartSpan(LSMetaEvent_InjectOperation,
|
|
opentracing.Tag{Key: LSMetaEvent_MetaEventKey, Value: true},
|
|
opentracing.Tag{Key: LSMetaEvent_TraceIdKey, Value: sc.(SpanContext).TraceID},
|
|
opentracing.Tag{Key: LSMetaEvent_SpanIdKey, Value: sc.(SpanContext).SpanID},
|
|
opentracing.Tag{Key: LSMetaEvent_PropagationFormatKey, Value: format}).
|
|
Finish()
|
|
}
|
|
|
|
builtin, ok := format.(opentracing.BuiltinFormat)
|
|
if !ok {
|
|
return opentracing.ErrUnsupportedFormat
|
|
}
|
|
return tracer.propagators[builtin].Inject(sc, carrier)
|
|
}
|
|
|
|
func (tracer *tracerImpl) Extract(format interface{}, carrier interface{}) (opentracing.SpanContext, error) {
|
|
if tracer.opts.MetaEventReportingEnabled {
|
|
opentracing.StartSpan(LSMetaEvent_ExtractOperation,
|
|
opentracing.Tag{Key: LSMetaEvent_MetaEventKey, Value: true},
|
|
opentracing.Tag{Key: LSMetaEvent_PropagationFormatKey, Value: format}).
|
|
Finish()
|
|
}
|
|
builtin, ok := format.(opentracing.BuiltinFormat)
|
|
if !ok {
|
|
return nil, opentracing.ErrUnsupportedFormat
|
|
}
|
|
return tracer.propagators[builtin].Extract(carrier)
|
|
}
|
|
|
|
func (tracer *tracerImpl) reconnectClient(now time.Time) {
|
|
conn, err := tracer.client.ConnectClient()
|
|
if err != nil {
|
|
emitEvent(newEventConnectionError(err))
|
|
} else {
|
|
tracer.lock.Lock()
|
|
oldConn := tracer.connection
|
|
tracer.connection = conn
|
|
tracer.lock.Unlock()
|
|
|
|
oldConn.Close()
|
|
}
|
|
}
|
|
|
|
// Close flushes and then terminates the LightStep collector. Close may only be
|
|
// called once; subsequent calls to Close are no-ops.
|
|
func (tracer *tracerImpl) Close(ctx context.Context) {
|
|
tracer.closeOnce.Do(func() {
|
|
// notify report loop that we are closing
|
|
close(tracer.closeReportLoopChannel)
|
|
select {
|
|
case <-tracer.reportLoopClosedChannel:
|
|
tracer.Flush(ctx)
|
|
case <-ctx.Done():
|
|
return
|
|
}
|
|
|
|
// now its safe to close the connection
|
|
tracer.lock.Lock()
|
|
conn := tracer.connection
|
|
tracer.connection = nil
|
|
tracer.lock.Unlock()
|
|
|
|
if conn != nil {
|
|
err := conn.Close()
|
|
if err != nil {
|
|
emitEvent(newEventConnectionError(err))
|
|
}
|
|
}
|
|
})
|
|
}
|
|
|
|
// RecordSpan records a finished Span.
|
|
func (tracer *tracerImpl) RecordSpan(raw RawSpan) {
|
|
tracer.lock.Lock()
|
|
|
|
// Early-out for disabled runtimes
|
|
if tracer.disabled {
|
|
tracer.lock.Unlock()
|
|
return
|
|
}
|
|
|
|
tracer.buffer.addSpan(raw)
|
|
tracer.lock.Unlock()
|
|
|
|
if tracer.opts.Recorder != nil {
|
|
tracer.opts.Recorder.RecordSpan(raw)
|
|
}
|
|
}
|
|
|
|
// Flush sends all buffered data to the collector.
|
|
func (tracer *tracerImpl) Flush(ctx context.Context) {
|
|
tracer.flushingLock.Lock()
|
|
defer tracer.flushingLock.Unlock()
|
|
|
|
if errorEvent := tracer.preFlush(); errorEvent != nil {
|
|
emitEvent(errorEvent)
|
|
return
|
|
}
|
|
|
|
if tracer.opts.MetaEventReportingEnabled && !tracer.firstReportHasRun {
|
|
opentracing.StartSpan(LSMetaEvent_TracerCreateOperation,
|
|
opentracing.Tag{Key: LSMetaEvent_MetaEventKey, Value: true},
|
|
opentracing.Tag{Key: LSMetaEvent_TracerGuidKey, Value: tracer.reporterID}).
|
|
Finish()
|
|
tracer.firstReportHasRun = true
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(ctx, tracer.opts.ReportTimeout)
|
|
defer cancel()
|
|
|
|
protoReq := tracer.converter.toReportRequest(
|
|
tracer.reporterID,
|
|
tracer.attributes,
|
|
tracer.accessToken,
|
|
&tracer.flushing,
|
|
)
|
|
req, err := tracer.client.Translate(protoReq)
|
|
if err != nil {
|
|
errorEvent := newEventFlushError(err, FlushErrorTranslate)
|
|
emitEvent(errorEvent)
|
|
// call postflush to prevent the tracer from going into an invalid state.
|
|
emitEvent(tracer.postFlush(errorEvent))
|
|
return
|
|
}
|
|
|
|
var reportErrorEvent *eventFlushError
|
|
resp, err := tracer.client.Report(ctx, req)
|
|
if err != nil {
|
|
reportErrorEvent = newEventFlushError(err, FlushErrorTransport)
|
|
} else if len(resp.GetErrors()) > 0 {
|
|
reportErrorEvent = newEventFlushError(fmt.Errorf(resp.GetErrors()[0]), FlushErrorReport)
|
|
}
|
|
|
|
if reportErrorEvent != nil {
|
|
emitEvent(reportErrorEvent)
|
|
}
|
|
emitEvent(tracer.postFlush(reportErrorEvent))
|
|
|
|
if err == nil && resp.DevMode() {
|
|
tracer.metaEventReportingEnabled = true
|
|
}
|
|
|
|
if err == nil && !resp.DevMode() {
|
|
tracer.metaEventReportingEnabled = false
|
|
}
|
|
|
|
if err == nil && resp.Disable() {
|
|
tracer.Disable()
|
|
}
|
|
}
|
|
|
|
// preFlush handles lock-protected data manipulation before flushing
|
|
func (tracer *tracerImpl) preFlush() *eventFlushError {
|
|
tracer.lock.Lock()
|
|
defer tracer.lock.Unlock()
|
|
|
|
if tracer.disabled {
|
|
return newEventFlushError(errFlushFailedTracerClosed, FlushErrorTracerDisabled)
|
|
}
|
|
|
|
if tracer.connection == nil {
|
|
return newEventFlushError(errFlushFailedTracerClosed, FlushErrorTracerClosed)
|
|
}
|
|
|
|
now := time.Now()
|
|
tracer.buffer, tracer.flushing = tracer.flushing, tracer.buffer
|
|
tracer.reportInFlight = true
|
|
tracer.flushing.setFlushing(now)
|
|
tracer.buffer.setCurrent(now)
|
|
tracer.lastReportAttempt = now
|
|
return nil
|
|
}
|
|
|
|
// postFlush handles lock-protected data manipulation after flushing
|
|
func (tracer *tracerImpl) postFlush(flushEventError *eventFlushError) *eventStatusReport {
|
|
tracer.lock.Lock()
|
|
defer tracer.lock.Unlock()
|
|
|
|
tracer.reportInFlight = false
|
|
|
|
statusReportEvent := newEventStatusReport(
|
|
tracer.flushing.reportStart,
|
|
tracer.flushing.reportEnd,
|
|
len(tracer.flushing.rawSpans),
|
|
int(tracer.flushing.droppedSpanCount+tracer.buffer.droppedSpanCount),
|
|
int(tracer.flushing.logEncoderErrorCount+tracer.buffer.logEncoderErrorCount),
|
|
)
|
|
|
|
if flushEventError == nil {
|
|
tracer.flushing.clear()
|
|
return statusReportEvent
|
|
}
|
|
|
|
switch flushEventError.State() {
|
|
case FlushErrorTranslate:
|
|
// When there's a translation error, we do not want to retry.
|
|
tracer.flushing.clear()
|
|
default:
|
|
// Restore the records that did not get sent correctly
|
|
tracer.buffer.mergeFrom(&tracer.flushing)
|
|
}
|
|
|
|
statusReportEvent.SetSentSpans(0)
|
|
|
|
return statusReportEvent
|
|
}
|
|
|
|
func (tracer *tracerImpl) Disable() {
|
|
tracer.lock.Lock()
|
|
if tracer.disabled {
|
|
tracer.lock.Unlock()
|
|
return
|
|
}
|
|
tracer.disabled = true
|
|
tracer.buffer.clear()
|
|
tracer.lock.Unlock()
|
|
|
|
emitEvent(newEventTracerDisabled())
|
|
}
|
|
|
|
// Every MinReportingPeriod the reporting loop wakes up and checks to see if
|
|
// either (a) the Runtime's max reporting period is about to expire (see
|
|
// maxReportingPeriod()), (b) the number of buffered log records is
|
|
// approaching kMaxBufferedLogs, or if (c) the number of buffered span records
|
|
// is approaching kMaxBufferedSpans. If any of those conditions are true,
|
|
// pending data is flushed to the remote peer. If not, the reporting loop waits
|
|
// until the next cycle. See Runtime.maybeFlush() for details.
|
|
//
|
|
// This could alternatively be implemented using flush channels and so forth,
|
|
// but that would introduce opportunities for client code to block on the
|
|
// runtime library, and we want to avoid that at all costs (even dropping data,
|
|
// which can certainly happen with high data rates and/or unresponsive remote
|
|
// peers).
|
|
|
|
func (tracer *tracerImpl) shouldFlushLocked(now time.Time) bool {
|
|
if now.Add(tracer.opts.MinReportingPeriod).Sub(tracer.lastReportAttempt) > tracer.opts.ReportingPeriod {
|
|
return true
|
|
} else if tracer.buffer.isHalfFull() {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (tracer *tracerImpl) reportLoop() {
|
|
tickerChan := time.Tick(tracer.opts.MinReportingPeriod)
|
|
for {
|
|
select {
|
|
case <-tickerChan:
|
|
now := time.Now()
|
|
|
|
tracer.lock.Lock()
|
|
disabled := tracer.disabled
|
|
reconnect := !tracer.reportInFlight && tracer.client.ShouldReconnect()
|
|
shouldFlush := tracer.shouldFlushLocked(now)
|
|
tracer.lock.Unlock()
|
|
|
|
if disabled {
|
|
return
|
|
}
|
|
if shouldFlush {
|
|
tracer.Flush(context.Background())
|
|
}
|
|
if reconnect {
|
|
tracer.reconnectClient(now)
|
|
}
|
|
case <-tracer.closeReportLoopChannel:
|
|
close(tracer.reportLoopClosedChannel)
|
|
return
|
|
}
|
|
}
|
|
}
|