summaryrefslogtreecommitdiff
path: root/internal/mapr
diff options
context:
space:
mode:
authorPaul Buetow <paul@buetow.org>2021-09-12 19:04:42 +0300
committerPaul Buetow <paul@buetow.org>2021-10-02 12:26:29 +0300
commit2ebe7e9d63ba62c6f19749c39fe0a577d86ca775 (patch)
tree2ae6d11a3cbc82152085a9d7755adef436b3ce46 /internal/mapr
parent842fd5800000bb68d6306a9ecad80a98ed762a2f (diff)
bugfix: dmap skipped the last couple of mapreduce lines
Diffstat (limited to 'internal/mapr')
-rw-r--r--internal/mapr/server/aggregate.go102
1 files changed, 42 insertions, 60 deletions
diff --git a/internal/mapr/server/aggregate.go b/internal/mapr/server/aggregate.go
index a6d6bb1..d11ed7d 100644
--- a/internal/mapr/server/aggregate.go
+++ b/internal/mapr/server/aggregate.go
@@ -19,16 +19,12 @@ import (
// Aggregate is for aggregating mapreduce data on the DTail server side.
type Aggregate struct {
done *internal.Done
- // Log lines to process (parsing MAPREDUCE lines).
- Lines chan line.Line
+ // NextLinesCh can be used to use a new line ch.
+ NextLinesCh chan chan line.Line
// Hostname of the current server (used to populate $hostname field).
hostname string
// Signals to serialize data.
serialize chan struct{}
- // Signals to flush data.
- flush chan struct{}
- // Signals that data has been flushed
- flushed chan struct{}
// The mapr query
query *mapr.Query
// The mapr log format parser
@@ -69,14 +65,12 @@ func NewAggregate(queryStr string) (*Aggregate, error) {
}
a := Aggregate{
- done: internal.NewDone(),
- Lines: make(chan line.Line, 100),
- serialize: make(chan struct{}),
- flush: make(chan struct{}),
- flushed: make(chan struct{}),
- hostname: s[0],
- query: query,
- parser: logParser,
+ done: internal.NewDone(),
+ NextLinesCh: make(chan chan line.Line, 10),
+ serialize: make(chan struct{}),
+ hostname: s[0],
+ query: query,
+ parser: logParser,
}
return &a, nil
@@ -84,12 +78,11 @@ func NewAggregate(queryStr string) (*Aggregate, error) {
// Shutdown the aggregation engine.
func (a *Aggregate) Shutdown() {
- a.Flush()
a.done.Shutdown()
}
// Start an aggregation.
-func (a *Aggregate) Start(ctx context.Context, maprLines chan<- string) {
+func (a *Aggregate) Start(ctx context.Context, maprMessages chan<- string) {
myCtx, cancel := context.WithCancel(ctx)
defer cancel()
@@ -102,16 +95,16 @@ func (a *Aggregate) Start(ctx context.Context, maprLines chan<- string) {
}
}()
- fieldsCh := a.makeFields(myCtx)
+ fieldsCh := a.fieldsFromLines(myCtx)
// Add fields (e.g. via 'set' clause)
if len(a.query.Set) > 0 {
- fieldsCh = a.addFields(myCtx, fieldsCh)
+ fieldsCh = a.setAdditionalFields(myCtx, fieldsCh)
}
// Periodically pre-aggregate data every a.query.Interval seconds.
go a.aggregateTimer(myCtx)
- a.makeMaprLines(myCtx, fieldsCh, maprLines)
+ a.aggregateAndSerialize(myCtx, fieldsCh, maprMessages)
}
func (a *Aggregate) aggregateTimer(ctx context.Context) {
@@ -125,23 +118,38 @@ func (a *Aggregate) aggregateTimer(ctx context.Context) {
}
}
-func (a *Aggregate) makeFields(ctx context.Context) <-chan map[string]string {
- ch := make(chan map[string]string)
+func (a *Aggregate) fieldsFromLines(ctx context.Context) <-chan map[string]string {
+ fieldsCh := make(chan map[string]string)
go func() {
- defer close(ch)
+ defer close(fieldsCh)
+ var lines chan line.Line
+
+ // Gather first lines channel (first input file)
+ select {
+ case lines = <-a.NextLinesCh:
+ case <-ctx.Done():
+ return
+ }
for {
select {
- case line, ok := <-a.Lines:
+ case line, ok := <-lines:
if !ok {
- return
+ select {
+ case lines = <-a.NextLinesCh:
+ // Have a new lines channel (e.g. new input file)
+ case <-ctx.Done():
+ default:
+ // No new lines channel found.
+ return
+ }
}
maprLine := strings.TrimSpace(line.Content.String())
+ fields, err := a.parser.MakeFields(maprLine)
pool.RecycleBytesBuffer(line.Content)
- fields, err := a.parser.MakeFields(maprLine)
if err != nil {
// Should fields be ignored anyway?
if err != logformat.IgnoreFieldsErr {
@@ -155,7 +163,7 @@ func (a *Aggregate) makeFields(ctx context.Context) <-chan map[string]string {
}
select {
- case ch <- fields:
+ case fieldsCh <- fields:
case <-ctx.Done():
}
case <-ctx.Done():
@@ -164,17 +172,16 @@ func (a *Aggregate) makeFields(ctx context.Context) <-chan map[string]string {
}
}()
- return ch
+ return fieldsCh
}
-func (a *Aggregate) addFields(ctx context.Context, fieldsCh <-chan map[string]string) <-chan map[string]string {
- ch := make(chan map[string]string)
+func (a *Aggregate) setAdditionalFields(ctx context.Context, fieldsCh <-chan map[string]string) <-chan map[string]string {
+ newFieldsCh := make(chan map[string]string)
go func() {
- defer close(ch)
+ defer close(newFieldsCh)
for {
- // fieldsCh will be closed via 'makeFields' when ctx is done
fields, ok := <-fieldsCh
if !ok {
return
@@ -184,23 +191,22 @@ func (a *Aggregate) addFields(ctx context.Context, fieldsCh <-chan map[string]st
}
select {
- case ch <- fields:
+ case newFieldsCh <- fields:
case <-ctx.Done():
}
}
}()
- return ch
+ return newFieldsCh
}
-func (a *Aggregate) makeMaprLines(ctx context.Context, fieldsCh <-chan map[string]string, maprLines chan<- string) {
+func (a *Aggregate) aggregateAndSerialize(ctx context.Context, fieldsCh <-chan map[string]string, maprMessages chan<- string) {
group := mapr.NewGroupSet()
serialize := func() {
logger.Info("Serializing mapreduce result")
- group.Serialize(ctx, maprLines)
+ group.Serialize(ctx, maprMessages)
group = mapr.NewGroupSet()
- logger.Info("Done serializing mapreduce result")
}
for {
@@ -213,9 +219,6 @@ func (a *Aggregate) makeMaprLines(ctx context.Context, fieldsCh <-chan map[strin
a.aggregate(group, fields)
case <-a.serialize:
serialize()
- case <-a.flush:
- serialize()
- a.flushed <- struct{}{}
case <-ctx.Done():
return
}
@@ -264,24 +267,3 @@ func (a *Aggregate) Serialize(ctx context.Context) {
case <-ctx.Done():
}
}
-
-// Flush all data.
-func (a *Aggregate) Flush() {
- select {
- case a.flush <- struct{}{}:
- logger.Info("Flushing mapreduce data")
- case <-time.After(time.Minute):
- logger.Warn("Starting to flush mapreduce data takes over a minute")
- return
- case <-a.done.Done():
- return
- }
-
- select {
- case <-a.flushed:
- logger.Info("Done flushing")
- case <-time.After(time.Minute):
- logger.Warn("Waiting for data to be flushed takes over a minute")
- case <-a.done.Done():
- }
-}