diff options
| author | Paul Buetow <paul@buetow.org> | 2021-09-12 19:04:42 +0300 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2021-10-02 12:26:29 +0300 |
| commit | 2ebe7e9d63ba62c6f19749c39fe0a577d86ca775 (patch) | |
| tree | 2ae6d11a3cbc82152085a9d7755adef436b3ce46 /internal/mapr | |
| parent | 842fd5800000bb68d6306a9ecad80a98ed762a2f (diff) | |
bugfix: dmap skipped the last couple of mapreduce lines
Diffstat (limited to 'internal/mapr')
| -rw-r--r-- | internal/mapr/server/aggregate.go | 102 |
1 files changed, 42 insertions, 60 deletions
diff --git a/internal/mapr/server/aggregate.go b/internal/mapr/server/aggregate.go index a6d6bb1..d11ed7d 100644 --- a/internal/mapr/server/aggregate.go +++ b/internal/mapr/server/aggregate.go @@ -19,16 +19,12 @@ import ( // Aggregate is for aggregating mapreduce data on the DTail server side. type Aggregate struct { done *internal.Done - // Log lines to process (parsing MAPREDUCE lines). - Lines chan line.Line + // NextLinesCh can be used to use a new line ch. + NextLinesCh chan chan line.Line // Hostname of the current server (used to populate $hostname field). hostname string // Signals to serialize data. serialize chan struct{} - // Signals to flush data. - flush chan struct{} - // Signals that data has been flushed - flushed chan struct{} // The mapr query query *mapr.Query // The mapr log format parser @@ -69,14 +65,12 @@ func NewAggregate(queryStr string) (*Aggregate, error) { } a := Aggregate{ - done: internal.NewDone(), - Lines: make(chan line.Line, 100), - serialize: make(chan struct{}), - flush: make(chan struct{}), - flushed: make(chan struct{}), - hostname: s[0], - query: query, - parser: logParser, + done: internal.NewDone(), + NextLinesCh: make(chan chan line.Line, 10), + serialize: make(chan struct{}), + hostname: s[0], + query: query, + parser: logParser, } return &a, nil @@ -84,12 +78,11 @@ func NewAggregate(queryStr string) (*Aggregate, error) { // Shutdown the aggregation engine. func (a *Aggregate) Shutdown() { - a.Flush() a.done.Shutdown() } // Start an aggregation. -func (a *Aggregate) Start(ctx context.Context, maprLines chan<- string) { +func (a *Aggregate) Start(ctx context.Context, maprMessages chan<- string) { myCtx, cancel := context.WithCancel(ctx) defer cancel() @@ -102,16 +95,16 @@ func (a *Aggregate) Start(ctx context.Context, maprLines chan<- string) { } }() - fieldsCh := a.makeFields(myCtx) + fieldsCh := a.fieldsFromLines(myCtx) // Add fields (e.g. via 'set' clause) if len(a.query.Set) > 0 { - fieldsCh = a.addFields(myCtx, fieldsCh) + fieldsCh = a.setAdditionalFields(myCtx, fieldsCh) } // Periodically pre-aggregate data every a.query.Interval seconds. go a.aggregateTimer(myCtx) - a.makeMaprLines(myCtx, fieldsCh, maprLines) + a.aggregateAndSerialize(myCtx, fieldsCh, maprMessages) } func (a *Aggregate) aggregateTimer(ctx context.Context) { @@ -125,23 +118,38 @@ func (a *Aggregate) aggregateTimer(ctx context.Context) { } } -func (a *Aggregate) makeFields(ctx context.Context) <-chan map[string]string { - ch := make(chan map[string]string) +func (a *Aggregate) fieldsFromLines(ctx context.Context) <-chan map[string]string { + fieldsCh := make(chan map[string]string) go func() { - defer close(ch) + defer close(fieldsCh) + var lines chan line.Line + + // Gather first lines channel (first input file) + select { + case lines = <-a.NextLinesCh: + case <-ctx.Done(): + return + } for { select { - case line, ok := <-a.Lines: + case line, ok := <-lines: if !ok { - return + select { + case lines = <-a.NextLinesCh: + // Have a new lines channel (e.g. new input file) + case <-ctx.Done(): + default: + // No new lines channel found. + return + } } maprLine := strings.TrimSpace(line.Content.String()) + fields, err := a.parser.MakeFields(maprLine) pool.RecycleBytesBuffer(line.Content) - fields, err := a.parser.MakeFields(maprLine) if err != nil { // Should fields be ignored anyway? if err != logformat.IgnoreFieldsErr { @@ -155,7 +163,7 @@ func (a *Aggregate) makeFields(ctx context.Context) <-chan map[string]string { } select { - case ch <- fields: + case fieldsCh <- fields: case <-ctx.Done(): } case <-ctx.Done(): @@ -164,17 +172,16 @@ func (a *Aggregate) makeFields(ctx context.Context) <-chan map[string]string { } }() - return ch + return fieldsCh } -func (a *Aggregate) addFields(ctx context.Context, fieldsCh <-chan map[string]string) <-chan map[string]string { - ch := make(chan map[string]string) +func (a *Aggregate) setAdditionalFields(ctx context.Context, fieldsCh <-chan map[string]string) <-chan map[string]string { + newFieldsCh := make(chan map[string]string) go func() { - defer close(ch) + defer close(newFieldsCh) for { - // fieldsCh will be closed via 'makeFields' when ctx is done fields, ok := <-fieldsCh if !ok { return @@ -184,23 +191,22 @@ func (a *Aggregate) addFields(ctx context.Context, fieldsCh <-chan map[string]st } select { - case ch <- fields: + case newFieldsCh <- fields: case <-ctx.Done(): } } }() - return ch + return newFieldsCh } -func (a *Aggregate) makeMaprLines(ctx context.Context, fieldsCh <-chan map[string]string, maprLines chan<- string) { +func (a *Aggregate) aggregateAndSerialize(ctx context.Context, fieldsCh <-chan map[string]string, maprMessages chan<- string) { group := mapr.NewGroupSet() serialize := func() { logger.Info("Serializing mapreduce result") - group.Serialize(ctx, maprLines) + group.Serialize(ctx, maprMessages) group = mapr.NewGroupSet() - logger.Info("Done serializing mapreduce result") } for { @@ -213,9 +219,6 @@ func (a *Aggregate) makeMaprLines(ctx context.Context, fieldsCh <-chan map[strin a.aggregate(group, fields) case <-a.serialize: serialize() - case <-a.flush: - serialize() - a.flushed <- struct{}{} case <-ctx.Done(): return } @@ -264,24 +267,3 @@ func (a *Aggregate) Serialize(ctx context.Context) { case <-ctx.Done(): } } - -// Flush all data. -func (a *Aggregate) Flush() { - select { - case a.flush <- struct{}{}: - logger.Info("Flushing mapreduce data") - case <-time.After(time.Minute): - logger.Warn("Starting to flush mapreduce data takes over a minute") - return - case <-a.done.Done(): - return - } - - select { - case <-a.flushed: - logger.Info("Done flushing") - case <-time.After(time.Minute): - logger.Warn("Waiting for data to be flushed takes over a minute") - case <-a.done.Done(): - } -} |
