filters: add package for filter syntax

With this PR, we add the syntax to use for filtration of items over the
containerd API. This package defines a syntax and parser that can be
used across types and use cases in a uniform manner.

The syntax is fairly familiar, if you've used container ecosystem
projects.  At the core, we base it on the concept of protobuf field
paths, augmenting with the ability to quote portions of the field path
to match arbitrary labels. These "selectors" come in the following
syntax:

```
<fieldpath>[<operator><value>]
```

A basic example is as follows:

```
name==foo
```

This would match all objects that have a field `name` with the value
`foo`. If we only want to test if the field is present, we can omit the
operator. This is most useful for matching labels in containerd. The
following will match objects that have the field `labels` and have the
label "foo" defined:

```
labels.foo
```

We also allow for quoting of parts of the field path to allow matching
of arbitrary items:

```
labels."very complex label"==something
```

We also define `!=` and `~=` as operators. The `!=` operator will match
all objects that don't match the value for a field and `~=` will compile
the target value as a regular expression and match the field value
against that.

Selectors can be combined using a comma, such that the resulting
selector will require all selectors are matched for the object to match.
The following example will match objects that are named `foo` and have
the label `bar`:

```
name==foo,labels.bar
```

This filter syntax will be used across all APIs that allow listing of
objects and for filtering which events a client sees. By using a common
syntax, we hope to keep API access uniform.

For the most part, this takes inspiration from docker, swarm and k8s,
but has the limitation that it only allows selection of an inner
product. We may expand to operators that implement `or`, `in` or
`notin`, but it is not clear that this is useful at this level of the
stack.

Signed-off-by: Stephen J Day <stephen.day@docker.com>
This commit is contained in:
Stephen J Day 2017-06-12 17:55:13 -07:00
parent 7e3b7dead6
commit d69ef98bfd
No known key found for this signature in database
GPG Key ID: 67B3DED84EDC823F
6 changed files with 1170 additions and 0 deletions

15
filters/adaptor.go Normal file
View File

@ -0,0 +1,15 @@
package filters
// Adaptor specifies the mapping of fieldpaths to a type. For the given field
// path, the value and whether it is present should be returned. The mapping of
// the fieldpath to a field is deferred to the adaptor implementation, but
// should generally follow protobuf field path/mask semantics.
type Adaptor interface {
	Field(fieldpath []string) (value string, present bool)
}

// AdapterFunc allows a plain function to be used as an Adaptor.
//
// NOTE(review): the spelling differs from the Adaptor interface it
// implements ("Adapter" vs "Adaptor"); consider unifying the names before
// the API freezes.
type AdapterFunc func(fieldpath []string) (string, bool)

// Field invokes the wrapped function for the given fieldpath.
func (fn AdapterFunc) Field(fieldpath []string) (string, bool) {
	return fn(fieldpath)
}

155
filters/filter.go Normal file
View File

@ -0,0 +1,155 @@
// Package filters defines a syntax and parser that can be used for the
// filtration of items across the containerd API. The core is built on the
// concept of protobuf field paths, with quoting. Several operators allow the
// user to flexibly select items based on field presence, equality, inequality
// and regular expressions. Flexible adaptors support working with any type.
//
// The syntax is fairly familiar, if you've used container ecosystem
// projects. At the core, we base it on the concept of protobuf field
// paths, augmenting with the ability to quote portions of the field path
// to match arbitrary labels. These "selectors" come in the following
// syntax:
//
// ```
// <fieldpath>[<operator><value>]
// ```
//
// A basic example is as follows:
//
// ```
// name==foo
// ```
//
// This would match all objects that have a field `name` with the value
// `foo`. If we only want to test if the field is present, we can omit the
// operator. This is most useful for matching labels in containerd. The
// following will match objects that have the field "labels" and have the
// label "foo" defined:
//
// ```
// labels.foo
// ```
//
// We also allow for quoting of parts of the field path to allow matching
// of arbitrary items:
//
// ```
// labels."very complex label"==something
// ```
//
// We also define `!=` and `~=` as operators. The `!=` will match all
// objects that don't match the value for a field and `~=` will compile the
// target value as a regular expression and match the field value against that.
//
// Selectors can be combined using a comma, such that the resulting
// selector will require all selectors are matched for the object to match.
// The following example will match objects that are named `foo` and have
// the label `bar`:
//
// ```
// name==foo,labels.bar
// ```
//
package filters
import (
"regexp"
"github.com/containerd/containerd/log"
)
// Filter matches objects exposed through an Adaptor. A nil-safe, composable
// predicate used across the containerd listing APIs.
type Filter interface {
	Match(adaptor Adaptor) bool
}

// FilterFunc allows a plain function to be used as a Filter.
type FilterFunc func(Adaptor) bool

// Match delegates to the wrapped function.
func (fn FilterFunc) Match(adaptor Adaptor) bool {
	return fn(adaptor)
}

// Always is a Filter that matches every object.
var Always FilterFunc = func(adaptor Adaptor) bool {
	return true
}
// Any is the disjunction of its member filters: it matches when at least one
// member matches.
type Any []Filter

// Match reports whether any filter in the set matches the adaptor.
func (m Any) Match(adaptor Adaptor) bool {
	for _, f := range m {
		if f.Match(adaptor) {
			return true
		}
	}

	return false
}
// All is the conjunction of its member filters: it matches only when every
// member matches.
type All []Filter

// Match reports whether all filters in the set match the adaptor.
func (m All) Match(adaptor Adaptor) bool {
	for _, f := range m {
		if !f.Match(adaptor) {
			return false
		}
	}

	return true
}
// operator identifies the comparison a selector applies to a field value.
type operator int

const (
	// operatorPresent matches when the field exists on the target.
	// The constants are explicitly typed as operator; the original untyped
	// declaration produced plain ints, weakening type checking.
	operatorPresent operator = iota
	// operatorEqual matches when the field value equals the selector value.
	operatorEqual
	// operatorNotEqual matches when the field value differs from the selector value.
	operatorNotEqual
	// operatorMatches compiles the selector value as a regular expression and
	// matches the field value against it.
	operatorMatches
)

// String returns the filter-syntax spelling of the operator ("?" denotes a
// bare presence check, which has no written form in a filter expression).
func (op operator) String() string {
	switch op {
	case operatorPresent:
		return "?"
	case operatorEqual:
		return "=="
	case operatorNotEqual:
		return "!="
	case operatorMatches:
		return "~="
	}

	return "unknown"
}
// selector matches a single fieldpath against a value using one operator.
type selector struct {
	fieldpath []string
	operator  operator
	value     string
	re        *regexp.Regexp // optional precompiled pattern for operatorMatches
}

// Match resolves the fieldpath through the adaptor and applies the operator.
// Unknown operators never match.
func (m selector) Match(adaptor Adaptor) bool {
	value, present := adaptor.Field(m.fieldpath)

	switch m.operator {
	case operatorPresent:
		return present
	case operatorEqual:
		return present && value == m.value
	case operatorNotEqual:
		return value != m.value
	case operatorMatches:
		re := m.re
		if re == nil {
			// NOTE: Match has a value receiver, so the original code's
			// `m.re = r` was an ineffective assignment — the cached regexp
			// was discarded when Match returned. Compile into a local
			// instead; behavior (compile per call when re is unset) is
			// unchanged. Caching would require a pointer receiver, which
			// would break the Filter interface on selector values.
			r, err := regexp.Compile(m.value)
			if err != nil {
				// Invalid patterns are logged and treated as non-matching.
				log.L.Errorf("error compiling regexp %q", m.value)
				return false
			}
			re = r
		}

		return re.MatchString(value)
	default:
		return false
	}
}

246
filters/filter_test.go Normal file
View File

@ -0,0 +1,246 @@
package filters
import (
"reflect"
"strings"
"testing"
)
// TestFilters runs the full parse-and-match pipeline over a small corpus,
// checking both the matched subsets and the exact parse-error messages.
// The only code change from the original is dropping the redundant `else`
// after the terminating `return` in the error branch.
func TestFilters(t *testing.T) {
	type cEntry struct {
		Name   string
		Other  string
		Labels map[string]string
	}

	corpusS := []cEntry{
		{
			Name: "foo",
			Labels: map[string]string{
				"foo": "true",
			},
		},
		{
			Name: "bar",
		},
		{
			Name: "foo",
			Labels: map[string]string{
				"foo":                "present",
				"more complex label": "present",
			},
		},
		{
			Name: "bar",
			Labels: map[string]string{
				"bar": "true",
			},
		},
		{
			Name: "fooer",
			Labels: map[string]string{
				"more complex label with \\ and \"": "present",
			},
		},
		{
			Name: "fooer",
			Labels: map[string]string{
				"more complex label with \\ and \".post": "present",
			},
		},
		{
			Name:  "baz",
			Other: "too complex, yo",
		},
		{
			Name:  "bazo",
			Other: "abc",
		},
	}

	var corpus []interface{}
	for _, entry := range corpusS {
		corpus = append(corpus, entry)
	}

	// adapt shows an example of how to build an adaptor function for a type.
	adapt := func(o interface{}) Adaptor {
		obj := o.(cEntry)
		return AdapterFunc(func(fieldpath []string) (string, bool) {
			switch fieldpath[0] {
			case "name":
				return obj.Name, len(obj.Name) > 0
			case "other":
				return obj.Other, len(obj.Other) > 0
			case "labels":
				value, ok := obj.Labels[strings.Join(fieldpath[1:], ".")]
				return value, ok
			}

			return "", false
		})
	}

	for _, testcase := range []struct {
		name      string
		input     string
		expected  []interface{}
		errString string
	}{
		{
			name:     "Empty",
			input:    "",
			expected: corpus,
		},
		{
			name:     "Present",
			input:    "name",
			expected: corpus,
		},
		{
			name:  "LabelPresent",
			input: "labels.foo",
			expected: []interface{}{
				corpus[0],
				corpus[2],
			},
		},
		{
			name:  "LabelValue",
			input: "labels.foo==true",
			expected: []interface{}{
				corpus[0],
			},
		},
		{
			name:  "Name",
			input: "name==bar",
			expected: []interface{}{
				corpus[1],
				corpus[3],
			},
		},
		{
			name:  "NameNotEqual",
			input: "name!=bar",
			expected: []interface{}{
				corpus[0],
				corpus[2],
				corpus[4],
				corpus[5],
				corpus[6],
				corpus[7],
			},
		},
		{
			name:  "NameAndLabelPresent",
			input: "name==bar,labels.bar",
			expected: []interface{}{
				corpus[3],
			},
		},
		{
			name:  "QuotedValue",
			input: "other==\"too complex, yo\"",
			expected: []interface{}{
				corpus[6],
			},
		},
		{
			name:  "RegexpValue",
			input: "other~=[abc]+,name!=foo",
			expected: []interface{}{
				corpus[6],
				corpus[7],
			},
		},
		{
			name:  "NameAndLabelValue",
			input: "name==bar,labels.bar==true",
			expected: []interface{}{
				corpus[3],
			},
		},
		{
			name:  "NameAndLabelValueNoMatch",
			input: "name==bar,labels.bar==wrong",
		},
		{
			name:  "LabelQuotedFieldPathPresent",
			input: `name==foo,labels."more complex label"`,
			expected: []interface{}{
				corpus[2],
			},
		},
		{
			name:  "LabelQuotedFieldPathPresentWithQuoted",
			input: `labels."more complex label with \\ and \""==present`,
			expected: []interface{}{
				corpus[4],
			},
		},
		{
			name:  "LabelQuotedFieldPathPresentWithQuotedEmbed",
			input: `labels."more complex label with \\ and \"".post==present`,
			expected: []interface{}{
				corpus[5],
			},
		},
		{
			name:      "LabelQuotedFieldPathPresentWithQuotedEmbedInvalid",
			input:     `labels.?"more complex label with \\ and \"".post==present`,
			errString: `filters: parse error: [labels. >|?|< "more complex label with \\ and \"".post==present]: expected field or quoted`,
		},
		{
			name:      "TrailingComma",
			input:     "name==foo,",
			errString: `filters: parse error: [name==foo,]: expected field or quoted`,
		},
		{
			name:      "TrailingFieldSeparator",
			input:     "labels.",
			errString: `filters: parse error: [labels.]: expected field or quoted`,
		},
		{
			name:      "MissingValue",
			input:     "image~=,id?=?fbaq",
			errString: `filters: parse error: [image~= >|,|< id?=?fbaq]: expected value or quoted`,
		},
	} {
		t.Run(testcase.name, func(t *testing.T) {
			t.Logf("testcase: %q", testcase.input)

			filter, err := Parse(testcase.input)
			if testcase.errString != "" {
				if err == nil {
					t.Fatalf("expected an error, but received nil")
				}
				if err.Error() != testcase.errString {
					t.Fatalf("error %v != %v", err, testcase.errString)
				}
				return
			}
			if err != nil {
				t.Fatal(err)
			}

			if filter == nil {
				t.Fatal("filter should not be nil")
			}

			t.Log("filter", filter)

			var results []interface{}
			for _, item := range corpus {
				adaptor := adapt(item)
				if filter.Match(adaptor) {
					results = append(results, item)
				}
			}

			if !reflect.DeepEqual(results, testcase.expected) {
				t.Fatalf("%q: %#v != %#v", testcase.input, results, testcase.expected)
			}
		})
	}
}

239
filters/parser.go Normal file
View File

@ -0,0 +1,239 @@
package filters
import (
"fmt"
"io"
"strconv"
"github.com/pkg/errors"
)
/*
Parse the strings into a filter that may be used with an adaptor.

The filter is made up of zero or more selectors.

The format is a comma separated list of expressions, in the form of
`<fieldpath><op><value>`, known as selectors. All selectors must match the
target object for the filter to be true.

We define the operators "==" for equality, "!=" for not equal and "~=" for a
regular expression. If the operator and value are not present, the matcher will
test for the presence of a value, as defined by the target object.

The formal grammar is as follows:

	selectors := selector ("," selector)*
	selector  := fieldpath (operator value)
	fieldpath := field ('.' field)*
	field     := quoted | [A-Za-z] [A-Za-z0-9_]+
	operator  := "==" | "!=" | "~="
	value     := quoted | [^\s,]+
	quoted    := <go string syntax>
*/
func Parse(s string) (Filter, error) {
	// special case empty to match all
	if s == "" {
		return Always, nil
	}

	p := parser{input: s}
	return p.parse()
}
// parser consumes tokens from an embedded scanner to assemble a Filter from
// the raw input string.
type parser struct {
	input   string  // original expression, retained for error reporting
	scanner scanner // tokenizer positioned over input
}
// parse initializes the scanner over the stored input and builds the
// conjunctive filter, wrapping any parse error with the package name.
func (p *parser) parse() (Filter, error) {
	p.scanner.init(p.input)

	filter, err := p.selectors()
	if err != nil {
		return nil, errors.Wrap(err, "filters")
	}

	return filter, nil
}
// selectors parses one or more comma-separated selectors and combines them
// into an All filter (every selector must match).
func (p *parser) selectors() (Filter, error) {
	s, err := p.selector()
	if err != nil {
		return nil, err
	}

	ss := All{s}

loop:
	for {
		tok := p.scanner.peek()
		switch tok {
		case ',':
			pos, tok, _ := p.scanner.scan()
			if tok != tokenSelectorSeparator {
				return nil, p.mkerr(pos, "expected a separator")
			}

			s, err := p.selector()
			if err != nil {
				return nil, err
			}

			ss = append(ss, s)
		case tokenEOF:
			break loop
		default:
			// The original code panicked here ("unconsumed input"), which a
			// caller could trigger with malformed input such as "a==b c".
			// Library code must not panic on user input; report it as a
			// parse error instead.
			return nil, p.mkerr(p.scanner.ppos, "unexpected input: %v", string(tok))
		}
	}

	return ss, nil
}
// selector parses a single selector: a fieldpath optionally followed by an
// operator and a value. A bare fieldpath becomes a presence check.
func (p *parser) selector() (selector, error) {
	fieldpath, err := p.fieldpath()
	if err != nil {
		return selector{}, err
	}

	// A separator or EOF directly after the fieldpath means no operator was
	// given: treat it as a presence test.
	switch p.scanner.peek() {
	case tokenSelectorSeparator, tokenEOF:
		return selector{
			fieldpath: fieldpath,
			operator:  operatorPresent,
		}, nil
	}

	op, err := p.operator()
	if err != nil {
		return selector{}, err
	}

	value, err := p.value()
	if err != nil {
		// Normalize a bare EOF into ErrUnexpectedEOF: input ended where a
		// value was required.
		if err == io.EOF {
			return selector{}, io.ErrUnexpectedEOF
		}
		return selector{}, err
	}

	return selector{
		fieldpath: fieldpath,
		value:     value,
		operator:  op,
	}, nil
}
// fieldpath parses a dot-separated sequence of field components into a slice,
// e.g. `labels."complex key"` -> ["labels", "complex key"].
func (p *parser) fieldpath() ([]string, error) {
	f, err := p.field()
	if err != nil {
		return nil, err
	}

	fs := []string{f}
loop:
	for {
		tok := p.scanner.peek() // lookahead to consume field separator
		switch tok {
		case '.':
			pos, tok, _ := p.scanner.scan() // consume separator
			if tok != tokenFieldSeparator {
				return nil, p.mkerr(pos, "expected a field separator (`.`)")
			}

			f, err := p.field()
			if err != nil {
				return nil, err
			}

			fs = append(fs, f)
		default:
			// let the layer above handle the other bad cases.
			break loop
		}
	}

	return fs, nil
}
// field parses one fieldpath component, which is either a bare field token or
// a quoted string (unquoted per Go string syntax).
func (p *parser) field() (string, error) {
	pos, tok, s := p.scanner.scan()
	if tok == tokenField {
		return s, nil
	}
	if tok == tokenQuoted {
		return p.unquote(pos, s)
	}

	return "", p.mkerr(pos, "expected field or quoted")
}
// operator parses one of the comparison operators: "==", "!=" or "~=".
func (p *parser) operator() (operator, error) {
	pos, tok, s := p.scanner.scan()
	if tok != tokenOperator {
		return 0, p.mkerr(pos, `expected an operator ("=="|"!="|"~=")`)
	}

	switch s {
	case "==":
		return operatorEqual, nil
	case "!=":
		return operatorNotEqual, nil
	case "~=":
		return operatorMatches, nil
	}

	return 0, p.mkerr(pos, "unsupported operator %q", s)
}
// value parses the right-hand side of a selector: an unquoted value, a bare
// field token (fields are legal values) or a quoted string.
func (p *parser) value() (string, error) {
	pos, tok, s := p.scanner.scan()
	switch {
	case tok == tokenValue || tok == tokenField:
		return s, nil
	case tok == tokenQuoted:
		return p.unquote(pos, s)
	default:
		return "", p.mkerr(pos, "expected value or quoted")
	}
}
// unquote interprets s as a Go quoted string literal, reporting a parse error
// anchored at pos on failure.
func (p *parser) unquote(pos int, s string) (string, error) {
	unquoted, err := strconv.Unquote(s)
	if err != nil {
		return "", p.mkerr(pos, "unquoting failed: %v", err)
	}

	return unquoted, nil
}
// parseError carries the full input plus a byte position so the rendered
// message can point at the offending character.
type parseError struct {
	input string
	pos   int
	msg   string
}

// Error renders the input with a >|x|< marker around the byte at the error
// position; when the position is at or past the end of input, the whole input
// is shown instead.
func (pe parseError) Error() string {
	if pe.pos >= len(pe.input) {
		return fmt.Sprintf("[%s]: %v", pe.input, pe.msg)
	}

	before, location, after := pe.input[:pe.pos], pe.input[pe.pos:pe.pos+1], pe.input[pe.pos+1:]
	return fmt.Sprintf("[%s >|%s|< %s]: %v", before, location, after, pe.msg)
}
// mkerr builds a parseError at pos with a formatted message, wrapped with the
// "parse error" prefix.
func (p *parser) mkerr(pos int, format string, args ...interface{}) error {
	pe := parseError{
		input: p.input,
		pos:   pos,
		msg:   fmt.Sprintf(format, args...),
	}
	return errors.Wrap(pe, "parse error")
}

279
filters/scanner.go Normal file
View File

@ -0,0 +1,279 @@
package filters
import (
"fmt"
"unicode"
"unicode/utf8"
)
// Token sentinels returned by the scanner. They are negative so they can share
// the rune value space with literal input characters (the scanner and parser
// compare them directly against runes, so they intentionally stay untyped).
const (
	tokenEOF = -(iota + 1)
	tokenQuoted
	tokenValue
	tokenField
	tokenFieldSeparator
	tokenOperator
	tokenSelectorSeparator
	tokenIllegal
)
// token identifies a lexical element: negative values are the sentinel tokens
// defined above, anything else is the literal rune itself.
type token rune

// String returns a human-readable name for sentinel tokens, or the literal
// character for everything else.
func (t token) String() string {
	switch t {
	case tokenEOF:
		return "EOF"
	case tokenQuoted:
		return "Quoted"
	case tokenValue:
		return "Value"
	case tokenField:
		return "Field"
	case tokenOperator:
		return "Operator"
	case tokenFieldSeparator:
		return "FieldSeparator"
	case tokenSelectorSeparator:
		return "SelectorSeparator"
	case tokenIllegal:
		return "Illegal"
	}

	return string(t)
}

// GoString implements fmt.GoStringer so %#v output in tests is readable.
func (t token) GoString() string {
	return "token" + t.String()
}
// scanner is a zero-copy tokenizer over the input string. pos marks the start
// of the current rune; ppos is the position just past it (the next read).
type scanner struct {
	input string
	pos   int
	ppos  int // bounds the current rune in the string
	value bool // when set, the next unquoted run is scanned as a value rather than a field
}

// init resets the scanner to the beginning of input.
func (s *scanner) init(input string) {
	s.input = input
	s.pos = 0
	s.ppos = 0
}
// next advances the scanner by one rune and returns it. It returns tokenEOF
// at end of input and tokenIllegal for undecodable UTF-8 or a NUL byte.
// pos is moved to the start of the returned rune and ppos just past it.
func (s *scanner) next() rune {
	if s.pos >= len(s.input) {
		return tokenEOF
	}

	s.pos = s.ppos
	r, w := utf8.DecodeRuneInString(s.input[s.ppos:])
	s.ppos += w
	if r == utf8.RuneError {
		if w > 0 {
			// Malformed byte: consume it and flag as illegal.
			return tokenIllegal
		} else {
			// Zero-width RuneError means the remaining input is empty.
			return tokenEOF
		}
	}

	if r == 0 {
		// NUL is never valid in a filter expression.
		return tokenIllegal
	}

	return r
}
// peek returns the next rune without consuming it, restoring the scanner
// position afterwards.
func (s *scanner) peek() rune {
	savedPos, savedPPos := s.pos, s.ppos
	ch := s.next()
	s.pos, s.ppos = savedPos, savedPPos
	return ch
}
// scan returns the position, token kind and matched text of the next token,
// skipping whitespace between tokens. EOF and illegal runes come back with
// empty text.
func (s *scanner) scan() (int, token, string) {
	var (
		ch  = s.next()
		pos = s.pos
	)

chomp:
	switch {
	case ch == tokenEOF:
	case ch == tokenIllegal:
	case isQuoteRune(ch):
		s.scanString(ch)
		return pos, tokenQuoted, s.input[pos:s.ppos]
	case ch == ',':
		return pos, tokenSelectorSeparator, s.input[pos:s.ppos]
	case ch == '.':
		return pos, tokenFieldSeparator, s.input[pos:s.ppos]
	case isOperatorRune(ch):
		s.scanOperator()
		// The run after an operator is a value, not a field.
		s.value = true
		return pos, tokenOperator, s.input[pos:s.ppos]
	case unicode.IsSpace(ch):
		// chomp: skip whitespace and re-dispatch on the next rune.
		ch = s.next()
		pos = s.pos
		goto chomp
	case s.value:
		s.scanValue()
		// TODO(stevvooe): We can get rid of the value flag by having a
		// scanUnquoted that accumulates characters. If it is a legal field,
		// then we return a field token. The parser can then treat fields as
		// values. This will allow the default case here to just scan value or
		// field.
		s.value = false
		return pos, tokenValue, s.input[pos:s.ppos]
	case isFieldRune(ch):
		s.scanField()
		return pos, tokenField, s.input[pos:s.ppos]
	}

	// EOF, illegal runes and anything unrecognized fall through here.
	return s.pos, token(ch), ""
}
// scanField consumes a run of field runes (letters, digits, underscore),
// leaving the scanner just past the last one.
func (s *scanner) scanField() {
	for isFieldRune(s.peek()) {
		s.next()
	}
}
// scanOperator consumes any remaining operator runes ('=', '!' or '~')
// following the one already read by the caller.
func (s *scanner) scanOperator() {
	for {
		ch := s.peek()
		if ch != '=' && ch != '!' && ch != '~' {
			return
		}
		s.next()
	}
}
// scanValue consumes a run of value runes, stopping at a comma, whitespace or
// end of input.
func (s *scanner) scanValue() {
	for isValueRune(s.peek()) {
		s.next()
	}
}
// scanString consumes a quoted literal, honoring backslash escapes, up to the
// closing quote rune. The opening quote has already been consumed by the
// caller. An unterminated literal (newline or EOF before the close quote) is
// reported via s.error.
//
// The trailing bare `return` of the original was redundant in a void function
// and has been removed (staticcheck S1023); behavior is unchanged.
func (s *scanner) scanString(quote rune) {
	ch := s.next() // read character after quote
	for ch != quote {
		if ch == '\n' || ch < 0 {
			s.error("literal not terminated")
			return
		}
		if ch == '\\' {
			ch = s.scanEscape(quote)
		} else {
			ch = s.next()
		}
	}
}
// scanEscape consumes one escape sequence following a backslash (Go string
// escape syntax) and returns the rune immediately after the sequence.
// Unrecognized escapes are reported via s.error.
func (s *scanner) scanEscape(quote rune) rune {
	ch := s.next() // read character after '\'
	switch ch {
	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
		// single-character escape: nothing to do
		ch = s.next()
	case '0', '1', '2', '3', '4', '5', '6', '7':
		// octal escape: up to three octal digits
		ch = s.scanDigits(ch, 8, 3)
	case 'x':
		ch = s.scanDigits(s.next(), 16, 2)
	case 'u':
		ch = s.scanDigits(s.next(), 16, 4)
	case 'U':
		ch = s.scanDigits(s.next(), 16, 8)
	default:
		s.error("illegal char escape")
	}
	return ch
}
// scanDigits consumes up to n digits of the given base, starting with ch, and
// returns the first rune past the digit run. Fewer than n valid digits is an
// error.
func (s *scanner) scanDigits(ch rune, base, n int) rune {
	remaining := n
	for remaining > 0 && digitVal(ch) < base {
		ch = s.next()
		remaining--
	}

	if remaining > 0 {
		s.error("illegal char escape")
	}

	return ch
}
// error reports a scanning error.
//
// TODO(review): this currently just prints "error fixme" to stdout; the
// message should be surfaced to the caller (e.g. stored on the scanner and
// returned through the parser) instead of being written to the console.
func (s *scanner) error(msg string) {
	fmt.Println("error fixme", msg)
}
// digitVal returns the numeric value of ch as a hexadecimal digit, or 16
// (larger than any legal digit value) when ch is not a digit.
func digitVal(ch rune) int {
	if ch >= '0' && ch <= '9' {
		return int(ch - '0')
	}
	if ch >= 'a' && ch <= 'f' {
		return int(ch-'a') + 10
	}
	if ch >= 'A' && ch <= 'F' {
		return int(ch-'A') + 10
	}

	return 16 // larger than any legal digit val
}
// isFieldRune reports whether r may appear in an unquoted field name.
func isFieldRune(r rune) bool {
	return r == '_' || isAlphaRune(r) || isDigitRune(r)
}

// isAlphaRune reports whether r is an ASCII letter.
func isAlphaRune(r rune) bool {
	return (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z')
}

// isDigitRune reports whether r is an ASCII digit.
func isDigitRune(r rune) bool {
	return r >= '0' && r <= '9'
}

// isOperatorRune reports whether r can start or continue an operator.
func isOperatorRune(r rune) bool {
	return r == '=' || r == '!' || r == '~'
}

// isQuoteRune reports whether r opens a quoted string.
func isQuoteRune(r rune) bool {
	return r == '"' // maybe add single quoting?
}

// isSeparatorRune reports whether r separates selectors or field components.
func isSeparatorRune(r rune) bool {
	return r == ',' || r == '.'
}

// isValueRune reports whether r may appear in an unquoted value: any
// printable rune except commas and whitespace.
func isValueRune(r rune) bool {
	if r == ',' || unicode.IsSpace(r) {
		return false
	}

	return unicode.IsLetter(r) ||
		unicode.IsDigit(r) ||
		unicode.IsNumber(r) ||
		unicode.IsGraphic(r) ||
		unicode.IsPunct(r)
}

236
filters/scanner_test.go Normal file
View File

@ -0,0 +1,236 @@
package filters
import (
"fmt"
"strconv"
"testing"
)
// tokenResult captures one scanner emission — the byte position, the token
// kind and the matched text — for comparison in the table-driven tests below.
type tokenResult struct {
	pos   int
	token token
	text  string
}

// String renders the result in the same literal shape used in the test
// tables, making failure output easy to paste back into a testcase.
func (tr tokenResult) String() string {
	return fmt.Sprintf("{pos: %v, token: %v, text: %q}", tr.pos, tr.token, tr.text)
}
// TestScanner drives the scanner over representative inputs and checks the
// exact token stream (position, kind, text) against the expected tables.
func TestScanner(t *testing.T) {
	for _, testcase := range []struct {
		name     string
		input    string
		expected []tokenResult
	}{
		{
			name:  "Field",
			input: "name",
			expected: []tokenResult{
				{pos: 0, token: tokenField, text: "name"},
				{pos: 4, token: tokenEOF},
			},
		},
		{
			name:  "SelectorsWithOperators",
			input: "name==value,foo!=bar",
			expected: []tokenResult{
				{pos: 0, token: tokenField, text: "name"},
				{pos: 4, token: tokenOperator, text: "=="},
				{pos: 6, token: tokenValue, text: "value"},
				{pos: 11, token: tokenSelectorSeparator, text: ","},
				{pos: 12, token: tokenField, text: "foo"},
				{pos: 15, token: tokenOperator, text: "!="},
				{pos: 17, token: tokenValue, text: "bar"},
				{pos: 20, token: tokenEOF},
			},
		},
		{
			name:  "SelectorsWithFieldPaths",
			input: "name==value,labels.foo=value,other.bar~=match",
			expected: []tokenResult{
				{pos: 0, token: tokenField, text: "name"},
				{pos: 4, token: tokenOperator, text: "=="},
				{pos: 6, token: tokenValue, text: "value"},
				{pos: 11, token: tokenSelectorSeparator, text: ","},
				{pos: 12, token: tokenField, text: "labels"},
				{pos: 18, token: tokenFieldSeparator, text: "."},
				{pos: 19, token: tokenField, text: "foo"},
				{pos: 22, token: tokenOperator, text: "="},
				{pos: 23, token: tokenValue, text: "value"},
				{pos: 28, token: tokenSelectorSeparator, text: ","},
				{pos: 29, token: tokenField, text: "other"},
				{pos: 34, token: tokenFieldSeparator, text: "."},
				{pos: 35, token: tokenField, text: "bar"},
				{pos: 38, token: tokenOperator, text: "~="},
				{pos: 40, token: tokenValue, text: "match"},
				{pos: 45, token: tokenEOF},
			},
		},
		{
			name:  "RegexpValue",
			input: "name~=[abc]+,foo=test",
			expected: []tokenResult{
				{pos: 0, token: tokenField, text: "name"},
				{pos: 4, token: tokenOperator, text: "~="},
				{pos: 6, token: tokenValue, text: "[abc]+"},
				{pos: 12, token: tokenSelectorSeparator, text: ","},
				{pos: 13, token: tokenField, text: "foo"},
				{pos: 16, token: tokenOperator, text: "="},
				{pos: 17, token: tokenValue, text: "test"},
				{pos: 21, token: tokenEOF},
			},
		},
		{
			name:  "RegexpEscapedValue",
			input: `name~=[abc]\+,foo=test`,
			expected: []tokenResult{
				{pos: 0, token: tokenField, text: "name"},
				{pos: 4, token: tokenOperator, text: "~="},
				{pos: 6, token: tokenValue, text: "[abc]\\+"},
				{pos: 13, token: tokenSelectorSeparator, text: ","},
				{pos: 14, token: tokenField, text: "foo"},
				{pos: 17, token: tokenOperator, text: "="},
				{pos: 18, token: tokenValue, text: "test"},
				{pos: 22, token: tokenEOF},
			},
		},
		{
			// multi-byte rune: positions are byte offsets, not rune counts.
			name:  "Cowsay",
			input: "name~=牛,labels.moo=true",
			expected: []tokenResult{
				{pos: 0, token: tokenField, text: "name"},
				{pos: 4, token: tokenOperator, text: "~="},
				{pos: 6, token: tokenValue, text: "牛"},
				{pos: 9, token: tokenSelectorSeparator, text: ","},
				{pos: 10, token: tokenField, text: "labels"},
				{pos: 16, token: tokenFieldSeparator, text: "."},
				{pos: 17, token: tokenField, text: "moo"},
				{pos: 20, token: tokenOperator, text: "="},
				{pos: 21, token: tokenValue, text: "true"},
				{pos: 25, token: tokenEOF},
			},
		},
		{
			name:  "Escapes",
			input: `name~="asdf\n\tfooo"`,
			expected: []tokenResult{
				{pos: 0, token: tokenField, text: "name"},
				{pos: 4, token: tokenOperator, text: "~="},
				{pos: 6, token: tokenQuoted, text: "\"asdf\\n\\tfooo\""},
				{pos: 20, token: tokenEOF},
			},
		},
		{
			// NUL bytes are surfaced as a single Illegal token.
			name:  "NullInput",
			input: "foo\x00bar",
			expected: []tokenResult{
				{pos: 0, token: tokenField, text: "foo"},
				{pos: 3, token: tokenIllegal},
				{pos: 4, token: tokenField, text: "bar"},
				{pos: 7, token: tokenEOF},
			},
		},
		{
			// NOTE(review): expected EOF at pos 13 implies the input literal
			// contains more whitespace than is visible here — verify against
			// the original source (whitespace may have been collapsed).
			name:  "SpacesChomped",
			input: "foo = bar ",
			expected: []tokenResult{
				{pos: 0, token: tokenField, text: "foo"},
				{pos: 4, token: tokenOperator, text: "="},
				{pos: 6, token: tokenValue, text: "bar"},
				{pos: 13, token: tokenEOF},
			},
		},
		{
			name:  "PartialInput",
			input: "interrupted=",
			expected: []tokenResult{
				{pos: 0, token: tokenField, text: "interrupted"},
				{pos: 11, token: tokenOperator, text: "="},
				{pos: 12, token: tokenEOF},
			},
		},
		{
			name:  "DoubleValue",
			input: "doublevalue=value value",
			expected: []tokenResult{
				{pos: 0, token: tokenField, text: "doublevalue"},
				{pos: 11, token: tokenOperator, text: "="},
				{pos: 12, token: tokenValue, text: "value"},
				{pos: 18, token: tokenField, text: "value"},
				{pos: 23, token: tokenEOF},
			},
		},
		{
			name:  "LeadingWithQuoted",
			input: `"leading quote".postquote==value`,
			expected: []tokenResult{
				{pos: 0, token: tokenQuoted, text: "\"leading quote\""},
				{pos: 15, token: tokenFieldSeparator, text: "."},
				{pos: 16, token: tokenField, text: "postquote"},
				{pos: 25, token: tokenOperator, text: "=="},
				{pos: 27, token: tokenValue, text: "value"},
				{pos: 32, token: tokenEOF},
			},
		},
		{
			name:  "MissingValue",
			input: "input==,id?=ff",
			expected: []tokenResult{
				{pos: 0, token: tokenField, text: "input"},
				{pos: 5, token: tokenOperator, text: "=="},
				{pos: 7, token: tokenSelectorSeparator, text: ","},
				{pos: 8, token: tokenValue, text: "id?=ff"},
				{pos: 14, token: tokenEOF},
			},
		},
	} {
		t.Run(testcase.name, func(t *testing.T) {
			var sc scanner
			sc.init(testcase.input)
			t.Logf("scan %q", testcase.input)

			// If you leave the expected empty, the test case will just print
			// out the token stream, which you can paste into the testcase when
			// adding new cases.
			if len(testcase.expected) == 0 {
				fmt.Println("Name", testcase.name)
			}

			for i := 0; ; i++ {
				pos, tok, s := sc.scan()
				t.Log("token", pos, tok, strconv.Quote(s))
				if len(testcase.expected) == 0 {
					// Bootstrap mode: emit table rows for a new testcase.
					if len(s) > 0 {
						fmt.Printf("{pos: %v, token: %#v, text: %q},\n", pos, tok, s)
					} else {
						fmt.Printf("{pos: %v, token: %#v},\n", pos, tok)
					}
				} else {
					tokv := tokenResult{pos: pos, token: tok, text: s}
					if i >= len(testcase.expected) {
						t.Fatalf("too many tokens parsed")
					}
					if tokv != testcase.expected[i] {
						t.Fatalf("token unexpected: %v != %v", tokv, testcase.expected[i])
					}
				}

				if tok == tokenEOF {
					break
				}
			}

			// make sure we've eof'd
			_, tok, _ := sc.scan()
			if tok != tokenEOF {
				t.Fatal("must consume all input")
			}

			// Bootstrap mode is never a passing state: expected must be filled in.
			if len(testcase.expected) == 0 {
				t.Fatal("must define expected tokens")
			}
		})
	}
}