* update bleve to master b17287a86f6cac923a5d886e10618df994eeb54b6724eac2e3b8dde89cfbe3a2
* remove unused pkg from dep file
* change bleve from master to recent revision
| @@ -40,14 +40,6 @@ | |||
| revision = "1a28a7fa985680f9f4e1644c0a857ec359a444b0" | |||
| version = "v0.4.7" | |||
| [[projects]] | |||
| branch = "master" | |||
| digest = "1:93367b6d47a8ccc7d14f9f493ccf103ccf5afb698559ff8e8f1999427ce27ace" | |||
| name = "github.com/Smerity/govarint" | |||
| packages = ["."] | |||
| pruneopts = "NUT" | |||
| revision = "7265e41f48f15fd61751e16da866af3c704bb3ab" | |||
| [[projects]] | |||
| branch = "master" | |||
| digest = "1:d290f4b25abbf574f80f60c8a5603ddada784f13f436b91a9a927bc7ce5a0146" | |||
| @@ -98,7 +90,8 @@ | |||
| revision = "3a771d992973f24aa725d07868b467d1ddfceafb" | |||
| [[projects]] | |||
| digest = "1:c10f35be6200b09e26da267ca80f837315093ecaba27e7a223071380efb9dd32" | |||
| branch = "master" | |||
| digest = "1:b17287a86f6cac923a5d886e10618df994eeb54b6724eac2e3b8dde89cfbe3a2" | |||
| name = "github.com/blevesearch/bleve" | |||
| packages = [ | |||
| ".", | |||
| @@ -121,7 +114,6 @@ | |||
| "index/scorch", | |||
| "index/scorch/mergeplan", | |||
| "index/scorch/segment", | |||
| "index/scorch/segment/mem", | |||
| "index/scorch/segment/zap", | |||
| "index/store", | |||
| "index/store/boltdb", | |||
| @@ -141,9 +133,10 @@ | |||
| "search/query", | |||
| "search/scorer", | |||
| "search/searcher", | |||
| "size", | |||
| ] | |||
| pruneopts = "NUT" | |||
| revision = "c74e08f039e56cef576e4336382b2a2d12d9e026" | |||
| revision = "05d86ea8f6e30456949f612cf68cf4a27ce8c9c5" | |||
| [[projects]] | |||
| branch = "master" | |||
| @@ -160,14 +153,6 @@ | |||
| pruneopts = "NUT" | |||
| revision = "db70c57796cc8c310613541dfade3dce627d09c7" | |||
| [[projects]] | |||
| digest = "1:c7e0968c05659f3973148cd5c5387d6ee960a6ae1b2eaaec0b1d435d806458bb" | |||
| name = "github.com/boltdb/bolt" | |||
| packages = ["."] | |||
| pruneopts = "NUT" | |||
| revision = "ccd680d8c1a0179ac3d68f692b01e1a1589cbfc7" | |||
| source = "github.com/go-gitea/bolt" | |||
| [[projects]] | |||
| digest = "1:7c96cf7bf7f52af67f7a8222185813b9b665f5172ec2ac5f7d49ed96e5fcf3e5" | |||
| name = "github.com/boombuler/barcode" | |||
| @@ -217,15 +202,16 @@ | |||
| [[projects]] | |||
| branch = "master" | |||
| digest = "1:82e1ad11d777f7bff9a1fc678a8a534a318f85e5026a8a4d6f4a94a6b0678bb6" | |||
| digest = "1:6a658ac7d23204dc743c7155557c45273747d78e05ae0579742bd6b744bce215" | |||
| name = "github.com/couchbase/vellum" | |||
| packages = [ | |||
| ".", | |||
| "levenshtein2", | |||
| "regexp", | |||
| "utf8", | |||
| ] | |||
| pruneopts = "NUT" | |||
| revision = "eb6ae3743b3f300f2136f83ca78c08cc071edbd4" | |||
| revision = "e91b68ff3efe3cc11723aa25dd315cbc9276cd65" | |||
| [[projects]] | |||
| branch = "master" | |||
| @@ -287,6 +273,14 @@ | |||
| revision = "1615341f118ae12f353cc8a983f35b584342c9b3" | |||
| version = "v1.12.0" | |||
| [[projects]] | |||
| digest = "1:ae8eea1a24ae43a46c2e96631b6303fcc4210ca0ac9d643e4da965029d1b511d" | |||
| name = "github.com/etcd-io/bbolt" | |||
| packages = ["."] | |||
| pruneopts = "NUT" | |||
| revision = "63597a96ec0ad9e6d43c3fc81e809909e0237461" | |||
| version = "v1.3.2" | |||
| [[projects]] | |||
| digest = "1:8603f74d35c93b37c615a02ba297be2cf2efc9ff6f1ff2b458a903990b568e48" | |||
| name = "github.com/ethantkoenig/rupture" | |||
| @@ -15,10 +15,8 @@ ignored = ["google.golang.org/appengine*"] | |||
| name = "code.gitea.io/sdk" | |||
| [[constraint]] | |||
| # branch = "master" | |||
| revision = "c74e08f039e56cef576e4336382b2a2d12d9e026" | |||
| revision = "05d86ea8f6e30456949f612cf68cf4a27ce8c9c5" | |||
| name = "github.com/blevesearch/bleve" | |||
| # Not targeting v0.7.0 since "standard" came into use only just after this tag | |||
| [[constraint]] | |||
| revision = "12dd70caea0268ac0d6c2707d0611ef601e7c64e" | |||
| @@ -108,11 +106,6 @@ ignored = ["google.golang.org/appengine*"] | |||
| name = "gopkg.in/testfixtures.v2" | |||
| version = "2.0.0" | |||
| [[override]] | |||
| name = "github.com/boltdb/bolt" | |||
| revision = "ccd680d8c1a0179ac3d68f692b01e1a1589cbfc7" | |||
| source = "github.com/go-gitea/bolt" | |||
| [[override]] | |||
| branch = "master" | |||
| name = "golang.org/x/oauth2" | |||
| @@ -1,22 +0,0 @@ | |||
| The MIT License (MIT) | |||
| Copyright (c) 2015 Stephen Merity | |||
| Permission is hereby granted, free of charge, to any person obtaining a copy | |||
| of this software and associated documentation files (the "Software"), to deal | |||
| in the Software without restriction, including without limitation the rights | |||
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
| copies of the Software, and to permit persons to whom the Software is | |||
| furnished to do so, subject to the following conditions: | |||
| The above copyright notice and this permission notice shall be included in all | |||
| copies or substantial portions of the Software. | |||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
| SOFTWARE. | |||
| @@ -1,229 +0,0 @@ | |||
| package govarint | |||
| import "encoding/binary" | |||
| import "io" | |||
| type U32VarintEncoder interface { | |||
| PutU32(x uint32) int | |||
| Close() | |||
| } | |||
| type U32VarintDecoder interface { | |||
| GetU32() (uint32, error) | |||
| } | |||
| /// | |||
| type U64VarintEncoder interface { | |||
| PutU64(x uint64) int | |||
| Close() | |||
| } | |||
| type U64VarintDecoder interface { | |||
| GetU64() (uint64, error) | |||
| } | |||
| /// | |||
| type U32GroupVarintEncoder struct { | |||
| w io.Writer | |||
| index int | |||
| store [4]uint32 | |||
| temp [17]byte | |||
| } | |||
| func NewU32GroupVarintEncoder(w io.Writer) *U32GroupVarintEncoder { return &U32GroupVarintEncoder{w: w} } | |||
| func (b *U32GroupVarintEncoder) Flush() (int, error) { | |||
| // TODO: Is it more efficient to have a tailored version that's called only in Close()? | |||
| // If index is zero, there are no integers to flush | |||
| if b.index == 0 { | |||
| return 0, nil | |||
| } | |||
| // In the case we're flushing (the group isn't of size four), the non-values should be zero | |||
| // This ensures the unused entries are all zero in the sizeByte | |||
| for i := b.index; i < 4; i++ { | |||
| b.store[i] = 0 | |||
| } | |||
| length := 1 | |||
| // We need to reset the size byte to zero as we only bitwise OR into it, we don't overwrite it | |||
| b.temp[0] = 0 | |||
| for i, x := range b.store { | |||
| size := byte(0) | |||
| shifts := []byte{24, 16, 8, 0} | |||
| for _, shift := range shifts { | |||
| // Always writes at least one byte -- the first one (shift = 0) | |||
| // Will write more bytes until the rest of the integer is all zeroes | |||
| if (x>>shift) != 0 || shift == 0 { | |||
| size += 1 | |||
| b.temp[length] = byte(x >> shift) | |||
| length += 1 | |||
| } | |||
| } | |||
| // We store the size in two of the eight bits in the first byte (sizeByte) | |||
| // 0 means there is one byte in total, hence we subtract one from size | |||
| b.temp[0] |= (size - 1) << (uint8(3-i) * 2) | |||
| } | |||
| // If we're flushing without a full group of four, remove the unused bytes we computed | |||
| // This enables us to realize it's a partial group on decoding thanks to EOF | |||
| if b.index != 4 { | |||
| length -= 4 - b.index | |||
| } | |||
| _, err := b.w.Write(b.temp[:length]) | |||
| return length, err | |||
| } | |||
| func (b *U32GroupVarintEncoder) PutU32(x uint32) (int, error) { | |||
| bytesWritten := 0 | |||
| b.store[b.index] = x | |||
| b.index += 1 | |||
| if b.index == 4 { | |||
| n, err := b.Flush() | |||
| if err != nil { | |||
| return n, err | |||
| } | |||
| bytesWritten += n | |||
| b.index = 0 | |||
| } | |||
| return bytesWritten, nil | |||
| } | |||
| func (b *U32GroupVarintEncoder) Close() { | |||
| // On Close, we flush any remaining values that might not have been in a full group | |||
| b.Flush() | |||
| } | |||
| /// | |||
| type U32GroupVarintDecoder struct { | |||
| r io.ByteReader | |||
| group [4]uint32 | |||
| pos int | |||
| finished bool | |||
| capacity int | |||
| } | |||
| func NewU32GroupVarintDecoder(r io.ByteReader) *U32GroupVarintDecoder { | |||
| return &U32GroupVarintDecoder{r: r, pos: 4, capacity: 4} | |||
| } | |||
| func (b *U32GroupVarintDecoder) getGroup() error { | |||
| // We should always receive a sizeByte if there are more values to read | |||
| sizeByte, err := b.r.ReadByte() | |||
| if err != nil { | |||
| return err | |||
| } | |||
| // Calculate the size of the four incoming 32 bit integers | |||
| // 0b00 means 1 byte to read, 0b01 = 2, etc | |||
| b.group[0] = uint32((sizeByte >> 6) & 3) | |||
| b.group[1] = uint32((sizeByte >> 4) & 3) | |||
| b.group[2] = uint32((sizeByte >> 2) & 3) | |||
| b.group[3] = uint32(sizeByte & 3) | |||
| // | |||
| for index, size := range b.group { | |||
| b.group[index] = 0 | |||
| // Any error that occurs in earlier byte reads will be repeated by the final one | |||
| // Hence we only catch and report the final ReadByte's error | |||
| var err error | |||
| switch size { | |||
| case 0: | |||
| var x byte | |||
| x, err = b.r.ReadByte() | |||
| b.group[index] = uint32(x) | |||
| case 1: | |||
| var x, y byte | |||
| x, _ = b.r.ReadByte() | |||
| y, err = b.r.ReadByte() | |||
| b.group[index] = uint32(x)<<8 | uint32(y) | |||
| case 2: | |||
| var x, y, z byte | |||
| x, _ = b.r.ReadByte() | |||
| y, _ = b.r.ReadByte() | |||
| z, err = b.r.ReadByte() | |||
| b.group[index] = uint32(x)<<16 | uint32(y)<<8 | uint32(z) | |||
| case 3: | |||
| var x, y, z, zz byte | |||
| x, _ = b.r.ReadByte() | |||
| y, _ = b.r.ReadByte() | |||
| z, _ = b.r.ReadByte() | |||
| zz, err = b.r.ReadByte() | |||
| b.group[index] = uint32(x)<<24 | uint32(y)<<16 | uint32(z)<<8 | uint32(zz) | |||
| } | |||
| if err != nil { | |||
| if err == io.EOF { | |||
| // If we hit EOF here, we have found a partial group | |||
| // We return any valid entries we have read, then return EOF once we run out | |||
| b.capacity = index | |||
| b.finished = true | |||
| break | |||
| } else { | |||
| return err | |||
| } | |||
| } | |||
| } | |||
| // Reset the pos pointer to the beginning of the read values | |||
| b.pos = 0 | |||
| return nil | |||
| } | |||
| func (b *U32GroupVarintDecoder) GetU32() (uint32, error) { | |||
| // Check if we have any more values to give out - if not, let's get them | |||
| if b.pos == b.capacity { | |||
| // If finished is set, there is nothing else to do | |||
| if b.finished { | |||
| return 0, io.EOF | |||
| } | |||
| err := b.getGroup() | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| } | |||
| // Increment pointer and return the value stored at that point | |||
| b.pos += 1 | |||
| return b.group[b.pos-1], nil | |||
| } | |||
| /// | |||
| type Base128Encoder struct { | |||
| w io.Writer | |||
| tmpBytes []byte | |||
| } | |||
| func NewU32Base128Encoder(w io.Writer) *Base128Encoder { | |||
| return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen32)} | |||
| } | |||
| func NewU64Base128Encoder(w io.Writer) *Base128Encoder { | |||
| return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen64)} | |||
| } | |||
| func (b *Base128Encoder) PutU32(x uint32) (int, error) { | |||
| writtenBytes := binary.PutUvarint(b.tmpBytes, uint64(x)) | |||
| return b.w.Write(b.tmpBytes[:writtenBytes]) | |||
| } | |||
| func (b *Base128Encoder) PutU64(x uint64) (int, error) { | |||
| writtenBytes := binary.PutUvarint(b.tmpBytes, x) | |||
| return b.w.Write(b.tmpBytes[:writtenBytes]) | |||
| } | |||
| func (b *Base128Encoder) Close() { | |||
| } | |||
| /// | |||
| type Base128Decoder struct { | |||
| r io.ByteReader | |||
| } | |||
| func NewU32Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} } | |||
| func NewU64Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} } | |||
| func (b *Base128Decoder) GetU32() (uint32, error) { | |||
| v, err := binary.ReadUvarint(b.r) | |||
| return uint32(v), err | |||
| } | |||
| func (b *Base128Decoder) GetU64() (uint64, error) { | |||
| return binary.ReadUvarint(b.r) | |||
| } | |||
| @@ -14,6 +14,22 @@ | |||
| package analysis | |||
| import ( | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeTokenLocation int | |||
| var reflectStaticSizeTokenFreq int | |||
| func init() { | |||
| var tl TokenLocation | |||
| reflectStaticSizeTokenLocation = int(reflect.TypeOf(tl).Size()) | |||
| var tf TokenFreq | |||
| reflectStaticSizeTokenFreq = int(reflect.TypeOf(tf).Size()) | |||
| } | |||
| // TokenLocation represents one occurrence of a term at a particular location in | |||
| // a field. Start, End and Position have the same meaning as in analysis.Token. | |||
| // Field and ArrayPositions identify the field value in the source document. | |||
| @@ -26,6 +42,12 @@ type TokenLocation struct { | |||
| Position int | |||
| } | |||
| func (tl *TokenLocation) Size() int { | |||
| rv := reflectStaticSizeTokenLocation | |||
| rv += len(tl.ArrayPositions) * size.SizeOfUint64 | |||
| return rv | |||
| } | |||
| // TokenFreq represents all the occurrences of a term in all fields of a | |||
| // document. | |||
| type TokenFreq struct { | |||
| @@ -34,6 +56,15 @@ type TokenFreq struct { | |||
| frequency int | |||
| } | |||
| func (tf *TokenFreq) Size() int { | |||
| rv := reflectStaticSizeTokenFreq | |||
| rv += len(tf.Term) | |||
| for _, loc := range tf.Locations { | |||
| rv += loc.Size() | |||
| } | |||
| return rv | |||
| } | |||
| func (tf *TokenFreq) Frequency() int { | |||
| return tf.frequency | |||
| } | |||
| @@ -42,6 +73,16 @@ func (tf *TokenFreq) Frequency() int { | |||
| // fields. | |||
| type TokenFrequencies map[string]*TokenFreq | |||
| func (tfs TokenFrequencies) Size() int { | |||
| rv := size.SizeOfMap | |||
| rv += len(tfs) * (size.SizeOfString + size.SizeOfPtr) | |||
| for k, v := range tfs { | |||
| rv += len(k) | |||
| rv += v.Size() | |||
| } | |||
| return rv | |||
| } | |||
| func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) { | |||
| // walk the new token frequencies | |||
| for tfk, tf := range other { | |||
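All of the `Size()` methods introduced in this hunk follow one pattern: measure a type's fixed-width footprint once at init time via reflection, then add the variable-length parts (string bytes, slice elements, map entries) per value. A standalone sketch of that pattern; `tokenLike` and the hard-coded 8-byte word size are stand-ins for bleve's real types and its `size` constants:

```go
package main

import (
	"fmt"
	"reflect"
)

// tokenLike is a hypothetical stand-in for types like TokenLocation above.
type tokenLike struct {
	Term      []byte
	Positions []uint64
}

// The fixed part of the footprint is measured once, exactly as the
// init() functions in this diff do.
var reflectStaticSizeTokenLike = int(reflect.TypeOf(tokenLike{}).Size())

const sizeOfUint64 = 8 // stands in for bleve's size.SizeOfUint64

func (t *tokenLike) Size() int {
	return reflectStaticSizeTokenLike +
		len(t.Term) + // backing bytes of the slice
		len(t.Positions)*sizeOfUint64 // per-element cost
}

func main() {
	t := &tokenLike{Term: []byte("search"), Positions: []uint64{1, 4}}
	fmt.Println(t.Size())
}
```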
| @@ -46,11 +46,11 @@ type Parser struct { | |||
| index int | |||
| } | |||
| func NewParser(len, position, index int) *Parser { | |||
| func NewParser(length, position, index int) *Parser { | |||
| return &Parser{ | |||
| bufferLen: len, | |||
| buffer: make([]rune, 0, len), | |||
| tokens: make([]*analysis.Token, 0, len), | |||
| bufferLen: length, | |||
| buffer: make([]rune, 0, length), | |||
| tokens: make([]*analysis.Token, 0, length), | |||
| position: position, | |||
| index: index, | |||
| } | |||
| @@ -21,7 +21,7 @@ import ( | |||
| const Name = "unique" | |||
| // UniqueTermFilter retains only the tokens which mark the first occurence of | |||
| // UniqueTermFilter retains only the tokens which mark the first occurrence of | |||
| // a term. Tokens whose term appears in a preceding token are dropped. | |||
| type UniqueTermFilter struct{} | |||
| @@ -14,7 +14,19 @@ | |||
| package document | |||
| import "fmt" | |||
| import ( | |||
| "fmt" | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeDocument int | |||
| func init() { | |||
| var d Document | |||
| reflectStaticSizeDocument = int(reflect.TypeOf(d).Size()) | |||
| } | |||
| type Document struct { | |||
| ID string `json:"id"` | |||
| @@ -30,6 +42,21 @@ func NewDocument(id string) *Document { | |||
| } | |||
| } | |||
| func (d *Document) Size() int { | |||
| sizeInBytes := reflectStaticSizeDocument + size.SizeOfPtr + | |||
| len(d.ID) | |||
| for _, entry := range d.Fields { | |||
| sizeInBytes += entry.Size() | |||
| } | |||
| for _, entry := range d.CompositeFields { | |||
| sizeInBytes += entry.Size() | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| func (d *Document) AddField(f Field) *Document { | |||
| switch f := f.(type) { | |||
| case *CompositeField: | |||
| @@ -36,4 +36,6 @@ type Field interface { | |||
| // that this field represents - this is a common metric for tracking | |||
| // the rate of indexing | |||
| NumPlainTextBytes() uint64 | |||
| Size() int | |||
| } | |||
| @@ -16,10 +16,19 @@ package document | |||
| import ( | |||
| "fmt" | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/analysis" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeBooleanField int | |||
| func init() { | |||
| var f BooleanField | |||
| reflectStaticSizeBooleanField = int(reflect.TypeOf(f).Size()) | |||
| } | |||
| const DefaultBooleanIndexingOptions = StoreField | IndexField | DocValues | |||
| type BooleanField struct { | |||
| @@ -30,6 +39,13 @@ type BooleanField struct { | |||
| numPlainTextBytes uint64 | |||
| } | |||
| func (b *BooleanField) Size() int { | |||
| return reflectStaticSizeBooleanField + size.SizeOfPtr + | |||
| len(b.name) + | |||
| len(b.arrayPositions)*size.SizeOfUint64 + | |||
| len(b.value) | |||
| } | |||
| func (b *BooleanField) Name() string { | |||
| return b.name | |||
| } | |||
| @@ -15,9 +15,19 @@ | |||
| package document | |||
| import ( | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/analysis" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeCompositeField int | |||
| func init() { | |||
| var cf CompositeField | |||
| reflectStaticSizeCompositeField = int(reflect.TypeOf(cf).Size()) | |||
| } | |||
| const DefaultCompositeIndexingOptions = IndexField | |||
| type CompositeField struct { | |||
| @@ -54,6 +64,21 @@ func NewCompositeFieldWithIndexingOptions(name string, defaultInclude bool, incl | |||
| return rv | |||
| } | |||
| func (c *CompositeField) Size() int { | |||
| sizeInBytes := reflectStaticSizeCompositeField + size.SizeOfPtr + | |||
| len(c.name) | |||
| for k := range c.includedFields { | |||
| sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool | |||
| } | |||
| for k := range c.excludedFields { | |||
| sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| func (c *CompositeField) Name() string { | |||
| return c.name | |||
| } | |||
| @@ -17,12 +17,21 @@ package document | |||
| import ( | |||
| "fmt" | |||
| "math" | |||
| "reflect" | |||
| "time" | |||
| "github.com/blevesearch/bleve/analysis" | |||
| "github.com/blevesearch/bleve/numeric" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeDateTimeField int | |||
| func init() { | |||
| var f DateTimeField | |||
| reflectStaticSizeDateTimeField = int(reflect.TypeOf(f).Size()) | |||
| } | |||
| const DefaultDateTimeIndexingOptions = StoreField | IndexField | DocValues | |||
| const DefaultDateTimePrecisionStep uint = 4 | |||
| @@ -37,6 +46,12 @@ type DateTimeField struct { | |||
| numPlainTextBytes uint64 | |||
| } | |||
| func (n *DateTimeField) Size() int { | |||
| return reflectStaticSizeDateTimeField + size.SizeOfPtr + | |||
| len(n.name) + | |||
| len(n.arrayPositions)*size.SizeOfUint64 | |||
| } | |||
| func (n *DateTimeField) Name() string { | |||
| return n.name | |||
| } | |||
| @@ -16,12 +16,21 @@ package document | |||
| import ( | |||
| "fmt" | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/analysis" | |||
| "github.com/blevesearch/bleve/geo" | |||
| "github.com/blevesearch/bleve/numeric" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeGeoPointField int | |||
| func init() { | |||
| var f GeoPointField | |||
| reflectStaticSizeGeoPointField = int(reflect.TypeOf(f).Size()) | |||
| } | |||
| var GeoPrecisionStep uint = 9 | |||
| type GeoPointField struct { | |||
| @@ -32,6 +41,12 @@ type GeoPointField struct { | |||
| numPlainTextBytes uint64 | |||
| } | |||
| func (n *GeoPointField) Size() int { | |||
| return reflectStaticSizeGeoPointField + size.SizeOfPtr + | |||
| len(n.name) + | |||
| len(n.arrayPositions)*size.SizeOfUint64 | |||
| } | |||
| func (n *GeoPointField) Name() string { | |||
| return n.name | |||
| } | |||
| @@ -16,11 +16,20 @@ package document | |||
| import ( | |||
| "fmt" | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/analysis" | |||
| "github.com/blevesearch/bleve/numeric" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeNumericField int | |||
| func init() { | |||
| var f NumericField | |||
| reflectStaticSizeNumericField = int(reflect.TypeOf(f).Size()) | |||
| } | |||
| const DefaultNumericIndexingOptions = StoreField | IndexField | DocValues | |||
| const DefaultPrecisionStep uint = 4 | |||
| @@ -33,6 +42,12 @@ type NumericField struct { | |||
| numPlainTextBytes uint64 | |||
| } | |||
| func (n *NumericField) Size() int { | |||
| return reflectStaticSizeNumericField + size.SizeOfPtr + | |||
| len(n.name) + | |||
| len(n.arrayPositions)*size.SizeOfPtr | |||
| } | |||
| func (n *NumericField) Name() string { | |||
| return n.name | |||
| } | |||
| @@ -16,10 +16,19 @@ package document | |||
| import ( | |||
| "fmt" | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/analysis" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeTextField int | |||
| func init() { | |||
| var f TextField | |||
| reflectStaticSizeTextField = int(reflect.TypeOf(f).Size()) | |||
| } | |||
| const DefaultTextIndexingOptions = IndexField | DocValues | |||
| type TextField struct { | |||
| @@ -31,6 +40,13 @@ type TextField struct { | |||
| numPlainTextBytes uint64 | |||
| } | |||
| func (t *TextField) Size() int { | |||
| return reflectStaticSizeTextField + size.SizeOfPtr + | |||
| len(t.name) + | |||
| len(t.arrayPositions)*size.SizeOfUint64 + | |||
| len(t.value) | |||
| } | |||
| func (t *TextField) Name() string { | |||
| return t.name | |||
| } | |||
| @@ -0,0 +1,174 @@ | |||
| // The code here was obtained from: | |||
| // https://github.com/mmcloughlin/geohash | |||
| // The MIT License (MIT) | |||
| // Copyright (c) 2015 Michael McLoughlin | |||
| // Permission is hereby granted, free of charge, to any person obtaining a copy | |||
| // of this software and associated documentation files (the "Software"), to deal | |||
| // in the Software without restriction, including without limitation the rights | |||
| // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
| // copies of the Software, and to permit persons to whom the Software is | |||
| // furnished to do so, subject to the following conditions: | |||
| // The above copyright notice and this permission notice shall be included in all | |||
| // copies or substantial portions of the Software. | |||
| // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
| // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
| // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
| // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
| // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
| // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
| // SOFTWARE. | |||
| package geo | |||
| import ( | |||
| "math" | |||
| ) | |||
| // encoding encapsulates an encoding defined by a given base32 alphabet. | |||
| type encoding struct { | |||
| enc string | |||
| dec [256]byte | |||
| } | |||
| // newEncoding constructs a new encoding defined by the given alphabet, | |||
| // which must be a 32-byte string. | |||
| func newEncoding(encoder string) *encoding { | |||
| e := new(encoding) | |||
| e.enc = encoder | |||
| for i := 0; i < len(e.dec); i++ { | |||
| e.dec[i] = 0xff | |||
| } | |||
| for i := 0; i < len(encoder); i++ { | |||
| e.dec[encoder[i]] = byte(i) | |||
| } | |||
| return e | |||
| } | |||
| // Decode string into bits of a 64-bit word. The string s may be at most 12 | |||
| // characters. | |||
| func (e *encoding) decode(s string) uint64 { | |||
| x := uint64(0) | |||
| for i := 0; i < len(s); i++ { | |||
| x = (x << 5) | uint64(e.dec[s[i]]) | |||
| } | |||
| return x | |||
| } | |||
| // Encode bits of 64-bit word into a string. | |||
| func (e *encoding) encode(x uint64) string { | |||
| b := [12]byte{} | |||
| for i := 0; i < 12; i++ { | |||
| b[11-i] = e.enc[x&0x1f] | |||
| x >>= 5 | |||
| } | |||
| return string(b[:]) | |||
| } | |||
| // Base32Encoding with the Geohash alphabet. | |||
| var base32encoding = newEncoding("0123456789bcdefghjkmnpqrstuvwxyz") | |||
| // BoundingBox returns the region encoded by the given string geohash. | |||
| func geoBoundingBox(hash string) geoBox { | |||
| bits := uint(5 * len(hash)) | |||
| inthash := base32encoding.decode(hash) | |||
| return geoBoundingBoxIntWithPrecision(inthash, bits) | |||
| } | |||
| // Box represents a rectangle in latitude/longitude space. | |||
| type geoBox struct { | |||
| minLat float64 | |||
| maxLat float64 | |||
| minLng float64 | |||
| maxLng float64 | |||
| } | |||
| // Round returns a point inside the box, making an effort to round to minimal | |||
| // precision. | |||
| func (b geoBox) round() (lat, lng float64) { | |||
| x := maxDecimalPower(b.maxLat - b.minLat) | |||
| lat = math.Ceil(b.minLat/x) * x | |||
| x = maxDecimalPower(b.maxLng - b.minLng) | |||
| lng = math.Ceil(b.minLng/x) * x | |||
| return | |||
| } | |||
| // precalculated for performance | |||
| var exp232 = math.Exp2(32) | |||
| // errorWithPrecision returns the error range in latitude and longitude for an | |||
| // integer geohash with bits of precision. | |||
| func errorWithPrecision(bits uint) (latErr, lngErr float64) { | |||
| b := int(bits) | |||
| latBits := b / 2 | |||
| lngBits := b - latBits | |||
| latErr = math.Ldexp(180.0, -latBits) | |||
| lngErr = math.Ldexp(360.0, -lngBits) | |||
| return | |||
| } | |||
| // maxDecimalPower returns the largest power of ten such that a multiple of | |||
| // it must exist within any range of width | |||
| // r. This is intended for returning minimal precision coordinates inside a | |||
| // box. | |||
| func maxDecimalPower(r float64) float64 { | |||
| m := int(math.Floor(math.Log10(r))) | |||
| return math.Pow10(m) | |||
| } | |||
| // Encode the position of x within the range -r to +r as a 32-bit integer. | |||
| func encodeRange(x, r float64) uint32 { | |||
| p := (x + r) / (2 * r) | |||
| return uint32(p * exp232) | |||
| } | |||
| // Decode the 32-bit range encoding X back to a value in the range -r to +r. | |||
| func decodeRange(X uint32, r float64) float64 { | |||
| p := float64(X) / exp232 | |||
| x := 2*r*p - r | |||
| return x | |||
| } | |||
| // Squash the even bitlevels of X into a 32-bit word. Odd bitlevels of X are | |||
| // ignored, and may take any value. | |||
| func squash(X uint64) uint32 { | |||
| X &= 0x5555555555555555 | |||
| X = (X | (X >> 1)) & 0x3333333333333333 | |||
| X = (X | (X >> 2)) & 0x0f0f0f0f0f0f0f0f | |||
| X = (X | (X >> 4)) & 0x00ff00ff00ff00ff | |||
| X = (X | (X >> 8)) & 0x0000ffff0000ffff | |||
| X = (X | (X >> 16)) & 0x00000000ffffffff | |||
| return uint32(X) | |||
| } | |||
| // Deinterleave the bits of X into 32-bit words containing the even and odd | |||
| // bitlevels of X, respectively. | |||
| func deinterleave(X uint64) (uint32, uint32) { | |||
| return squash(X), squash(X >> 1) | |||
| } | |||
| // BoundingBoxIntWithPrecision returns the region encoded by the integer | |||
| // geohash with the specified precision. | |||
| func geoBoundingBoxIntWithPrecision(hash uint64, bits uint) geoBox { | |||
| fullHash := hash << (64 - bits) | |||
| latInt, lngInt := deinterleave(fullHash) | |||
| lat := decodeRange(latInt, 90) | |||
| lng := decodeRange(lngInt, 180) | |||
| latErr, lngErr := errorWithPrecision(bits) | |||
| return geoBox{ | |||
| minLat: lat, | |||
| maxLat: lat + latErr, | |||
| minLng: lng, | |||
| maxLng: lng + lngErr, | |||
| } | |||
| } | |||
| // ---------------------------------------------------------------------- | |||
| // Decode the string geohash to a (lat, lng) point. | |||
| func GeoHashDecode(hash string) (lat, lng float64) { | |||
| box := geoBoundingBox(hash) | |||
| return box.round() | |||
| } | |||
| @@ -16,6 +16,7 @@ package geo | |||
| import ( | |||
| "reflect" | |||
| "strconv" | |||
| "strings" | |||
| ) | |||
| @@ -24,6 +25,8 @@ import ( | |||
| // Container: | |||
| // slice length 2 (GeoJSON) | |||
| // first element lon, second element lat | |||
| // string (coordinates separated by comma, or a geohash) | |||
| // first element lat, second element lon | |||
| // map[string]interface{} | |||
| // exact keys lat and lon or lng | |||
| // struct | |||
| @@ -36,10 +39,14 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { | |||
| var foundLon, foundLat bool | |||
| thingVal := reflect.ValueOf(thing) | |||
| if !thingVal.IsValid() { | |||
| return lon, lat, false | |||
| } | |||
| thingTyp := thingVal.Type() | |||
| // is it a slice | |||
| if thingVal.IsValid() && thingVal.Kind() == reflect.Slice { | |||
| if thingVal.Kind() == reflect.Slice { | |||
| // must be length 2 | |||
| if thingVal.Len() == 2 { | |||
| first := thingVal.Index(0) | |||
| @@ -55,6 +62,35 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { | |||
| } | |||
| } | |||
| // is it a string | |||
| if thingVal.Kind() == reflect.String { | |||
| geoStr := thingVal.Interface().(string) | |||
| if strings.Contains(geoStr, ",") { | |||
| // geo point with coordinates split by comma | |||
| points := strings.Split(geoStr, ",") | |||
| for i, point := range points { | |||
| // trim any leading or trailing white spaces | |||
| points[i] = strings.TrimSpace(point) | |||
| } | |||
| if len(points) == 2 { | |||
| var err error | |||
| lat, err = strconv.ParseFloat(points[0], 64) | |||
| if err == nil { | |||
| foundLat = true | |||
| } | |||
| lon, err = strconv.ParseFloat(points[1], 64) | |||
| if err == nil { | |||
| foundLon = true | |||
| } | |||
| } | |||
| } else { | |||
| // geohash | |||
| lat, lon = GeoHashDecode(geoStr) | |||
| foundLat = true | |||
| foundLon = true | |||
| } | |||
| } | |||
| // is it a map | |||
| if l, ok := thing.(map[string]interface{}); ok { | |||
| if lval, ok := l["lon"]; ok { | |||
| @@ -68,7 +104,7 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { | |||
| } | |||
| // now try reflection on struct fields | |||
| if thingVal.IsValid() && thingVal.Kind() == reflect.Struct { | |||
| if thingVal.Kind() == reflect.Struct { | |||
| for i := 0; i < thingVal.NumField(); i++ { | |||
| fieldName := thingTyp.Field(i).Name | |||
| if strings.HasPrefix(strings.ToLower(fieldName), "lon") { | |||
| @@ -113,6 +149,9 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { | |||
| // extract numeric value (if possible) and returns a float64 | |||
| func extractNumericVal(v interface{}) (float64, bool) { | |||
| val := reflect.ValueOf(v) | |||
| if !val.IsValid() { | |||
| return 0, false | |||
| } | |||
| typ := val.Type() | |||
| switch typ.Kind() { | |||
| case reflect.Float32, reflect.Float64: | |||
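With the new string branch, `ExtractGeoPoint` accepts the same point in three container shapes. A hedged sketch of calling it directly, though it is normally invoked through the geopoint field mapping:

```go
package main

import (
	"fmt"

	"github.com/blevesearch/bleve/geo"
)

func main() {
	for _, thing := range []interface{}{
		[]interface{}{10.5, 42.0},                        // GeoJSON slice: lon first
		"42.0, 10.5",                                     // "lat, lon" string (new in this change)
		map[string]interface{}{"lat": 42.0, "lon": 10.5}, // map with lat/lon keys
	} {
		lon, lat, ok := geo.ExtractGeoPoint(thing)
		fmt.Println(lon, lat, ok)
	}
}
```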
| @@ -21,6 +21,7 @@ import ( | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/index/store" | |||
| "github.com/blevesearch/bleve/mapping" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| // A Batch groups together multiple Index and Delete | |||
| @@ -32,6 +33,9 @@ import ( | |||
| type Batch struct { | |||
| index Index | |||
| internal *index.Batch | |||
| lastDocSize uint64 | |||
| totalSize uint64 | |||
| } | |||
| // Index adds the specified index operation to the | |||
| @@ -47,9 +51,22 @@ func (b *Batch) Index(id string, data interface{}) error { | |||
| return err | |||
| } | |||
| b.internal.Update(doc) | |||
| b.lastDocSize = uint64(doc.Size() + | |||
| len(id) + size.SizeOfString) // overhead from internal | |||
| b.totalSize += b.lastDocSize | |||
| return nil | |||
| } | |||
| func (b *Batch) LastDocSize() uint64 { | |||
| return b.lastDocSize | |||
| } | |||
| func (b *Batch) TotalDocsSize() uint64 { | |||
| return b.totalSize | |||
| } | |||
| // IndexAdvanced adds the specified index operation to the | |||
| // batch which skips the mapping. NOTE: the bleve Index is not updated | |||
| // until the batch is executed. | |||
| @@ -102,6 +119,24 @@ func (b *Batch) Reset() { | |||
| b.internal.Reset() | |||
| } | |||
| func (b *Batch) Merge(o *Batch) { | |||
| if o != nil && o.internal != nil { | |||
| b.internal.Merge(o.internal) | |||
| if o.LastDocSize() > 0 { | |||
| b.lastDocSize = o.LastDocSize() | |||
| } | |||
| b.totalSize = uint64(b.internal.TotalDocSize()) | |||
| } | |||
| } | |||
| func (b *Batch) SetPersistedCallback(f index.BatchCallback) { | |||
| b.internal.SetPersistedCallback(f) | |||
| } | |||
| func (b *Batch) PersistedCallback() index.BatchCallback { | |||
| return b.internal.PersistedCallback() | |||
| } | |||
| // An Index implements all the indexing and searching | |||
| // capabilities of bleve. An Index can be created | |||
| // using the New() and Open() methods. | |||
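The new `LastDocSize`/`TotalDocsSize` accessors and the persisted callback let callers (Gitea's code indexer among them) bound how much memory a batch holds and learn when a flush is actually durable. A sketch assuming an already-open index; the 16 MiB budget is an arbitrary illustration:

```go
package indexer

import (
	"log"

	"github.com/blevesearch/bleve"
)

// indexAll flushes whenever the accumulated document size crosses the
// budget, and re-registers the persisted callback after each Reset
// (Reset clears it, per the internal Batch.Reset later in this diff).
func indexAll(idx bleve.Index, docs map[string]interface{}) error {
	const maxBatchSize = 16 << 20 // illustrative 16 MiB budget

	batch := idx.NewBatch()
	flush := func() error {
		batch.SetPersistedCallback(func(err error) {
			if err == nil {
				log.Println("batch persisted")
			}
		})
		if err := idx.Batch(batch); err != nil {
			return err
		}
		batch.Reset()
		return nil
	}

	for id, data := range docs {
		if err := batch.Index(id, data); err != nil {
			return err
		}
		if batch.TotalDocsSize() >= maxBatchSize {
			if err := flush(); err != nil {
				return err
			}
		}
	}
	return flush()
}
```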
| @@ -15,10 +15,20 @@ | |||
| package index | |||
| import ( | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/analysis" | |||
| "github.com/blevesearch/bleve/document" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeAnalysisResult int | |||
| func init() { | |||
| var ar AnalysisResult | |||
| reflectStaticSizeAnalysisResult = int(reflect.TypeOf(ar).Size()) | |||
| } | |||
| type IndexRow interface { | |||
| KeySize() int | |||
| KeyTo([]byte) (int, error) | |||
| @@ -39,6 +49,15 @@ type AnalysisResult struct { | |||
| Length []int | |||
| } | |||
| func (a *AnalysisResult) Size() int { | |||
| rv := reflectStaticSizeAnalysisResult | |||
| for _, analyzedI := range a.Analyzed { | |||
| rv += analyzedI.Size() | |||
| } | |||
| rv += len(a.Length) * size.SizeOfInt | |||
| return rv | |||
| } | |||
| type AnalysisWork struct { | |||
| i Index | |||
| d *document.Document | |||
| @@ -18,11 +18,23 @@ import ( | |||
| "bytes" | |||
| "encoding/json" | |||
| "fmt" | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/document" | |||
| "github.com/blevesearch/bleve/index/store" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeTermFieldDoc int | |||
| var reflectStaticSizeTermFieldVector int | |||
| func init() { | |||
| var tfd TermFieldDoc | |||
| reflectStaticSizeTermFieldDoc = int(reflect.TypeOf(tfd).Size()) | |||
| var tfv TermFieldVector | |||
| reflectStaticSizeTermFieldVector = int(reflect.TypeOf(tfv).Size()) | |||
| } | |||
| var ErrorUnknownStorageType = fmt.Errorf("unknown storage type") | |||
| type Index interface { | |||
| @@ -68,6 +80,8 @@ type IndexReader interface { | |||
| Document(id string) (*document.Document, error) | |||
| DocumentVisitFieldTerms(id IndexInternalID, fields []string, visitor DocumentFieldTermVisitor) error | |||
| DocValueReader(fields []string) (DocValueReader, error) | |||
| Fields() ([]string, error) | |||
| GetInternal(key []byte) ([]byte, error) | |||
| @@ -84,6 +98,29 @@ type IndexReader interface { | |||
| Close() error | |||
| } | |||
| // The Regexp interface defines the subset of the regexp.Regexp API | |||
| // methods that are used by bleve indexes, allowing callers to pass in | |||
| // alternate implementations. | |||
| type Regexp interface { | |||
| FindStringIndex(s string) (loc []int) | |||
| LiteralPrefix() (prefix string, complete bool) | |||
| String() string | |||
| } | |||
| type IndexReaderRegexp interface { | |||
| FieldDictRegexp(field string, regex string) (FieldDict, error) | |||
| } | |||
| type IndexReaderFuzzy interface { | |||
| FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (FieldDict, error) | |||
| } | |||
| type IndexReaderOnly interface { | |||
| FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error) | |||
| } | |||
| // FieldTerms contains the terms used by a document, keyed by field | |||
| type FieldTerms map[string][]string | |||
| @@ -115,6 +152,11 @@ type TermFieldVector struct { | |||
| End uint64 | |||
| } | |||
| func (tfv *TermFieldVector) Size() int { | |||
| return reflectStaticSizeTermFieldVector + size.SizeOfPtr + | |||
| len(tfv.Field) + len(tfv.ArrayPositions)*size.SizeOfUint64 | |||
| } | |||
| // IndexInternalID is an opaque document identifier internal to the index impl | |||
| type IndexInternalID []byte | |||
| @@ -134,14 +176,27 @@ type TermFieldDoc struct { | |||
| Vectors []*TermFieldVector | |||
| } | |||
| func (tfd *TermFieldDoc) Size() int { | |||
| sizeInBytes := reflectStaticSizeTermFieldDoc + size.SizeOfPtr + | |||
| len(tfd.Term) + len(tfd.ID) | |||
| for _, entry := range tfd.Vectors { | |||
| sizeInBytes += entry.Size() | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| // Reset allows an already allocated TermFieldDoc to be reused | |||
| func (tfd *TermFieldDoc) Reset() *TermFieldDoc { | |||
| // remember the []byte used for the ID | |||
| id := tfd.ID | |||
| vectors := tfd.Vectors | |||
| // idiom to copy over from empty TermFieldDoc (0 allocations) | |||
| *tfd = TermFieldDoc{} | |||
| // reuse the []byte already allocated (and reset len to 0) | |||
| tfd.ID = id[:0] | |||
| tfd.Vectors = vectors[:0] | |||
| return tfd | |||
| } | |||
| @@ -161,6 +216,8 @@ type TermFieldReader interface { | |||
| // Count returns the number of documents that contain the term in this field. | |||
| Count() uint64 | |||
| Close() error | |||
| Size() int | |||
| } | |||
| type DictEntry struct { | |||
| @@ -185,12 +242,18 @@ type DocIDReader interface { | |||
| // will start there instead. If ID is greater than or equal to the end of | |||
| // the range, the Next() call will return io.EOF. | |||
| Advance(ID IndexInternalID) (IndexInternalID, error) | |||
| Size() int | |||
| Close() error | |||
| } | |||
| type BatchCallback func(error) | |||
| type Batch struct { | |||
| IndexOps map[string]*document.Document | |||
| InternalOps map[string][]byte | |||
| IndexOps map[string]*document.Document | |||
| InternalOps map[string][]byte | |||
| persistedCallback BatchCallback | |||
| } | |||
| func NewBatch() *Batch { | |||
| @@ -216,6 +279,14 @@ func (b *Batch) DeleteInternal(key []byte) { | |||
| b.InternalOps[string(key)] = nil | |||
| } | |||
| func (b *Batch) SetPersistedCallback(f BatchCallback) { | |||
| b.persistedCallback = f | |||
| } | |||
| func (b *Batch) PersistedCallback() BatchCallback { | |||
| return b.persistedCallback | |||
| } | |||
| func (b *Batch) String() string { | |||
| rv := fmt.Sprintf("Batch (%d ops, %d internal ops)\n", len(b.IndexOps), len(b.InternalOps)) | |||
| for k, v := range b.IndexOps { | |||
| @@ -238,4 +309,53 @@ func (b *Batch) String() string { | |||
| func (b *Batch) Reset() { | |||
| b.IndexOps = make(map[string]*document.Document) | |||
| b.InternalOps = make(map[string][]byte) | |||
| b.persistedCallback = nil | |||
| } | |||
| func (b *Batch) Merge(o *Batch) { | |||
| for k, v := range o.IndexOps { | |||
| b.IndexOps[k] = v | |||
| } | |||
| for k, v := range o.InternalOps { | |||
| b.InternalOps[k] = v | |||
| } | |||
| } | |||
| func (b *Batch) TotalDocSize() int { | |||
| var s int | |||
| for k, v := range b.IndexOps { | |||
| if v != nil { | |||
| s += v.Size() + size.SizeOfString | |||
| } | |||
| s += len(k) | |||
| } | |||
| return s | |||
| } | |||
| // Optimizable represents an optional interface that is implementable by | |||
| // optimizable resources (e.g., TermFieldReaders, Searchers). These | |||
| // optimizable resources are provided the same OptimizableContext | |||
| // instance, so that they can coordinate via dynamic interface | |||
| // casting. | |||
| type Optimizable interface { | |||
| Optimize(kind string, octx OptimizableContext) (OptimizableContext, error) | |||
| } | |||
| // Represents a result of optimization -- see the Finish() method. | |||
| type Optimized interface{} | |||
| type OptimizableContext interface { | |||
| // Once all the optimizable resources have been provided the same | |||
| // OptimizableContext instance, the optimization preparations are | |||
| // finished or completed via the Finish() method. | |||
| // | |||
| // Depending on the optimization being performed, the Finish() | |||
| // method might return a non-nil Optimized instance. For example, | |||
| // the Optimized instance might represent an optimized | |||
| // TermFieldReader instance. | |||
| Finish() (Optimized, error) | |||
| } | |||
| type DocValueReader interface { | |||
| VisitDocValues(id IndexInternalID, visitor DocumentFieldTermVisitor) error | |||
| } | |||
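The `Optimizable` contract is cooperative: each resource is handed the same context, and only the final `Finish()` yields a result. A sketch of a driver loop mirroring that description; `optimizeAll` is a hypothetical helper, not part of the bleve API:

```go
package indexutil

import "github.com/blevesearch/bleve/index"

// optimizeAll threads one OptimizableContext through every resource,
// then completes the preparation; a nil context simply means nothing
// opted in to the optimization.
func optimizeAll(kind string, resources []index.Optimizable) (index.Optimized, error) {
	var octx index.OptimizableContext
	for _, r := range resources {
		var err error
		octx, err = r.Optimize(kind, octx)
		if err != nil {
			return nil, err
		}
	}
	if octx == nil {
		return nil, nil
	}
	return octx.Finish()
}
```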
| @@ -19,7 +19,9 @@ import ( | |||
| "sync/atomic" | |||
| "github.com/RoaringBitmap/roaring" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/index/scorch/segment" | |||
| "github.com/blevesearch/bleve/index/scorch/segment/zap" | |||
| ) | |||
| type segmentIntroduction struct { | |||
| @@ -29,8 +31,14 @@ type segmentIntroduction struct { | |||
| ids []string | |||
| internal map[string][]byte | |||
| applied chan error | |||
| persisted chan error | |||
| applied chan error | |||
| persisted chan error | |||
| persistedCallback index.BatchCallback | |||
| } | |||
| type persistIntroduction struct { | |||
| persisted map[uint64]segment.Segment | |||
| applied notificationChan | |||
| } | |||
| type epochWatcher struct { | |||
| @@ -48,6 +56,8 @@ func (s *Scorch) mainLoop() { | |||
| var epochWatchers []*epochWatcher | |||
| OUTER: | |||
| for { | |||
| atomic.AddUint64(&s.stats.TotIntroduceLoop, 1) | |||
| select { | |||
| case <-s.closeCh: | |||
| break OUTER | |||
| @@ -64,6 +74,9 @@ OUTER: | |||
| continue OUTER | |||
| } | |||
| case persist := <-s.persists: | |||
| s.introducePersist(persist) | |||
| case revertTo := <-s.revertToSnapshots: | |||
| err := s.revertToSnapshot(revertTo) | |||
| if err != nil { | |||
| @@ -92,32 +105,38 @@ OUTER: | |||
| } | |||
| func (s *Scorch) introduceSegment(next *segmentIntroduction) error { | |||
| // acquire lock | |||
| s.rootLock.Lock() | |||
| atomic.AddUint64(&s.stats.TotIntroduceSegmentBeg, 1) | |||
| defer atomic.AddUint64(&s.stats.TotIntroduceSegmentEnd, 1) | |||
| s.rootLock.RLock() | |||
| root := s.root | |||
| root.AddRef() | |||
| s.rootLock.RUnlock() | |||
| defer func() { _ = root.DecRef() }() | |||
| nsegs := len(s.root.segment) | |||
| nsegs := len(root.segment) | |||
| // prepare new index snapshot | |||
| newSnapshot := &IndexSnapshot{ | |||
| parent: s, | |||
| segment: make([]*SegmentSnapshot, 0, nsegs+1), | |||
| offsets: make([]uint64, 0, nsegs+1), | |||
| internal: make(map[string][]byte, len(s.root.internal)), | |||
| epoch: s.nextSnapshotEpoch, | |||
| internal: make(map[string][]byte, len(root.internal)), | |||
| refs: 1, | |||
| creator: "introduceSegment", | |||
| } | |||
| s.nextSnapshotEpoch++ | |||
| // iterate through current segments | |||
| var running uint64 | |||
| for i := range s.root.segment { | |||
| var docsToPersistCount, memSegments, fileSegments uint64 | |||
| for i := range root.segment { | |||
| // see if optimistic work included this segment | |||
| delta, ok := next.obsoletes[s.root.segment[i].id] | |||
| delta, ok := next.obsoletes[root.segment[i].id] | |||
| if !ok { | |||
| var err error | |||
| delta, err = s.root.segment[i].segment.DocNumbers(next.ids) | |||
| delta, err = root.segment[i].segment.DocNumbers(next.ids) | |||
| if err != nil { | |||
| s.rootLock.Unlock() | |||
| next.applied <- fmt.Errorf("error computing doc numbers: %v", err) | |||
| close(next.applied) | |||
| _ = newSnapshot.DecRef() | |||
| @@ -126,43 +145,60 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { | |||
| } | |||
| newss := &SegmentSnapshot{ | |||
| id: s.root.segment[i].id, | |||
| segment: s.root.segment[i].segment, | |||
| cachedDocs: s.root.segment[i].cachedDocs, | |||
| id: root.segment[i].id, | |||
| segment: root.segment[i].segment, | |||
| cachedDocs: root.segment[i].cachedDocs, | |||
| creator: root.segment[i].creator, | |||
| } | |||
| // apply new obsoletions | |||
| if s.root.segment[i].deleted == nil { | |||
| if root.segment[i].deleted == nil { | |||
| newss.deleted = delta | |||
| } else { | |||
| newss.deleted = roaring.Or(s.root.segment[i].deleted, delta) | |||
| newss.deleted = roaring.Or(root.segment[i].deleted, delta) | |||
| } | |||
| if newss.deleted.IsEmpty() { | |||
| newss.deleted = nil | |||
| } | |||
| // check for live size before copying | |||
| if newss.LiveSize() > 0 { | |||
| newSnapshot.segment = append(newSnapshot.segment, newss) | |||
| s.root.segment[i].segment.AddRef() | |||
| root.segment[i].segment.AddRef() | |||
| newSnapshot.offsets = append(newSnapshot.offsets, running) | |||
| running += s.root.segment[i].Count() | |||
| running += newss.segment.Count() | |||
| } | |||
| if isMemorySegment(root.segment[i]) { | |||
| docsToPersistCount += root.segment[i].Count() | |||
| memSegments++ | |||
| } else { | |||
| fileSegments++ | |||
| } | |||
| } | |||
| atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) | |||
| atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments) | |||
| atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments) | |||
| // append new segment, if any, to end of the new index snapshot | |||
| if next.data != nil { | |||
| newSegmentSnapshot := &SegmentSnapshot{ | |||
| id: next.id, | |||
| segment: next.data, // take ownership of next.data's ref-count | |||
| cachedDocs: &cachedDocs{cache: nil}, | |||
| creator: "introduceSegment", | |||
| } | |||
| newSnapshot.segment = append(newSnapshot.segment, newSegmentSnapshot) | |||
| newSnapshot.offsets = append(newSnapshot.offsets, running) | |||
| // increment numItemsIntroduced which tracks the number of items | |||
| // queued for persistence. | |||
| atomic.AddUint64(&s.stats.numItemsIntroduced, newSegmentSnapshot.Count()) | |||
| atomic.AddUint64(&s.stats.TotIntroducedItems, newSegmentSnapshot.Count()) | |||
| atomic.AddUint64(&s.stats.TotIntroducedSegmentsBatch, 1) | |||
| } | |||
| // copy old values | |||
| for key, oldVal := range s.root.internal { | |||
| for key, oldVal := range root.internal { | |||
| newSnapshot.internal[key] = oldVal | |||
| } | |||
| // set new values and apply deletes | |||
| @@ -173,12 +209,21 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { | |||
| delete(newSnapshot.internal, key) | |||
| } | |||
| } | |||
| newSnapshot.updateSize() | |||
| s.rootLock.Lock() | |||
| if next.persisted != nil { | |||
| s.rootPersisted = append(s.rootPersisted, next.persisted) | |||
| } | |||
| if next.persistedCallback != nil { | |||
| s.persistedCallbacks = append(s.persistedCallbacks, next.persistedCallback) | |||
| } | |||
| // swap in new index snapshot | |||
| newSnapshot.epoch = s.nextSnapshotEpoch | |||
| s.nextSnapshotEpoch++ | |||
| rootPrev := s.root | |||
| s.root = newSnapshot | |||
| atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch) | |||
| // release lock | |||
| s.rootLock.Unlock() | |||
| @@ -191,42 +236,113 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { | |||
| return nil | |||
| } | |||
| func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { | |||
| // acquire lock | |||
| func (s *Scorch) introducePersist(persist *persistIntroduction) { | |||
| atomic.AddUint64(&s.stats.TotIntroducePersistBeg, 1) | |||
| defer atomic.AddUint64(&s.stats.TotIntroducePersistEnd, 1) | |||
| s.rootLock.Lock() | |||
| root := s.root | |||
| root.AddRef() | |||
| nextSnapshotEpoch := s.nextSnapshotEpoch | |||
| s.nextSnapshotEpoch++ | |||
| s.rootLock.Unlock() | |||
| // prepare new index snapshot | |||
| currSize := len(s.root.segment) | |||
| newSize := currSize + 1 - len(nextMerge.old) | |||
| defer func() { _ = root.DecRef() }() | |||
| newIndexSnapshot := &IndexSnapshot{ | |||
| parent: s, | |||
| epoch: nextSnapshotEpoch, | |||
| segment: make([]*SegmentSnapshot, len(root.segment)), | |||
| offsets: make([]uint64, len(root.offsets)), | |||
| internal: make(map[string][]byte, len(root.internal)), | |||
| refs: 1, | |||
| creator: "introducePersist", | |||
| } | |||
| var docsToPersistCount, memSegments, fileSegments uint64 | |||
| for i, segmentSnapshot := range root.segment { | |||
| // see if this segment has been replaced | |||
| if replacement, ok := persist.persisted[segmentSnapshot.id]; ok { | |||
| newSegmentSnapshot := &SegmentSnapshot{ | |||
| id: segmentSnapshot.id, | |||
| segment: replacement, | |||
| deleted: segmentSnapshot.deleted, | |||
| cachedDocs: segmentSnapshot.cachedDocs, | |||
| creator: "introducePersist", | |||
| } | |||
| newIndexSnapshot.segment[i] = newSegmentSnapshot | |||
| delete(persist.persisted, segmentSnapshot.id) | |||
| // update items persisted in case of a new segment snapshot | |||
| atomic.AddUint64(&s.stats.TotPersistedItems, newSegmentSnapshot.Count()) | |||
| atomic.AddUint64(&s.stats.TotPersistedSegments, 1) | |||
| fileSegments++ | |||
| } else { | |||
| newIndexSnapshot.segment[i] = root.segment[i] | |||
| newIndexSnapshot.segment[i].segment.AddRef() | |||
| if isMemorySegment(root.segment[i]) { | |||
| docsToPersistCount += root.segment[i].Count() | |||
| memSegments++ | |||
| } else { | |||
| fileSegments++ | |||
| } | |||
| } | |||
| newIndexSnapshot.offsets[i] = root.offsets[i] | |||
| } | |||
| for k, v := range root.internal { | |||
| newIndexSnapshot.internal[k] = v | |||
| } | |||
| atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) | |||
| atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments) | |||
| atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments) | |||
| newIndexSnapshot.updateSize() | |||
| s.rootLock.Lock() | |||
| rootPrev := s.root | |||
| s.root = newIndexSnapshot | |||
| atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch) | |||
| s.rootLock.Unlock() | |||
| // empty segments deletion | |||
| if nextMerge.new == nil { | |||
| newSize-- | |||
| if rootPrev != nil { | |||
| _ = rootPrev.DecRef() | |||
| } | |||
| close(persist.applied) | |||
| } | |||
| func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { | |||
| atomic.AddUint64(&s.stats.TotIntroduceMergeBeg, 1) | |||
| defer atomic.AddUint64(&s.stats.TotIntroduceMergeEnd, 1) | |||
| s.rootLock.RLock() | |||
| root := s.root | |||
| root.AddRef() | |||
| s.rootLock.RUnlock() | |||
| defer func() { _ = root.DecRef() }() | |||
| newSnapshot := &IndexSnapshot{ | |||
| parent: s, | |||
| segment: make([]*SegmentSnapshot, 0, newSize), | |||
| offsets: make([]uint64, 0, newSize), | |||
| internal: s.root.internal, | |||
| epoch: s.nextSnapshotEpoch, | |||
| internal: root.internal, | |||
| refs: 1, | |||
| creator: "introduceMerge", | |||
| } | |||
| s.nextSnapshotEpoch++ | |||
| // iterate through current segments | |||
| newSegmentDeleted := roaring.NewBitmap() | |||
| var running uint64 | |||
| for i := range s.root.segment { | |||
| segmentID := s.root.segment[i].id | |||
| var running, docsToPersistCount, memSegments, fileSegments uint64 | |||
| for i := range root.segment { | |||
| segmentID := root.segment[i].id | |||
| if segSnapAtMerge, ok := nextMerge.old[segmentID]; ok { | |||
| // this segment is going away, see if anything else was deleted since we started the merge | |||
| if segSnapAtMerge != nil && s.root.segment[i].deleted != nil { | |||
| if segSnapAtMerge != nil && root.segment[i].deleted != nil { | |||
| // assume all these deletes are new | |||
| deletedSince := s.root.segment[i].deleted | |||
| deletedSince := root.segment[i].deleted | |||
| // if we already knew about some of them, remove | |||
| if segSnapAtMerge.deleted != nil { | |||
| deletedSince = roaring.AndNot(s.root.segment[i].deleted, segSnapAtMerge.deleted) | |||
| deletedSince = roaring.AndNot(root.segment[i].deleted, segSnapAtMerge.deleted) | |||
| } | |||
| deletedSinceItr := deletedSince.Iterator() | |||
| for deletedSinceItr.HasNext() { | |||
| @@ -240,18 +356,25 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { | |||
| // segments left behind in old map after processing | |||
| // the root segments would be the obsolete segment set | |||
| delete(nextMerge.old, segmentID) | |||
| } else if s.root.segment[i].LiveSize() > 0 { | |||
| } else if root.segment[i].LiveSize() > 0 { | |||
| // this segment is staying | |||
| newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ | |||
| id: s.root.segment[i].id, | |||
| segment: s.root.segment[i].segment, | |||
| deleted: s.root.segment[i].deleted, | |||
| cachedDocs: s.root.segment[i].cachedDocs, | |||
| id: root.segment[i].id, | |||
| segment: root.segment[i].segment, | |||
| deleted: root.segment[i].deleted, | |||
| cachedDocs: root.segment[i].cachedDocs, | |||
| creator: root.segment[i].creator, | |||
| }) | |||
| s.root.segment[i].segment.AddRef() | |||
| root.segment[i].segment.AddRef() | |||
| newSnapshot.offsets = append(newSnapshot.offsets, running) | |||
| running += s.root.segment[i].Count() | |||
| running += root.segment[i].segment.Count() | |||
| if isMemorySegment(root.segment[i]) { | |||
| docsToPersistCount += root.segment[i].Count() | |||
| memSegments++ | |||
| } else { | |||
| fileSegments++ | |||
| } | |||
| } | |||
| } | |||
| @@ -269,6 +392,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { | |||
| } | |||
| } | |||
| } | |||
| // In the case where all the docs in the newly merged segment have been | |||
| // deleted by the time we reach here, we can skip the introduction. | |||
| if nextMerge.new != nil && | |||
| @@ -279,15 +403,35 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { | |||
| segment: nextMerge.new, // take ownership for nextMerge.new's ref-count | |||
| deleted: newSegmentDeleted, | |||
| cachedDocs: &cachedDocs{cache: nil}, | |||
| creator: "introduceMerge", | |||
| }) | |||
| newSnapshot.offsets = append(newSnapshot.offsets, running) | |||
| atomic.AddUint64(&s.stats.TotIntroducedSegmentsMerge, 1) | |||
| switch nextMerge.new.(type) { | |||
| case *zap.SegmentBase: | |||
| docsToPersistCount += nextMerge.new.Count() - newSegmentDeleted.GetCardinality() | |||
| memSegments++ | |||
| case *zap.Segment: | |||
| fileSegments++ | |||
| } | |||
| } | |||
| atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) | |||
| atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments) | |||
| atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments) | |||
| newSnapshot.AddRef() // 1 ref for the nextMerge.notify response | |||
| // swap in new segment | |||
| newSnapshot.updateSize() | |||
| s.rootLock.Lock() | |||
| // swap in new index snapshot | |||
| newSnapshot.epoch = s.nextSnapshotEpoch | |||
| s.nextSnapshotEpoch++ | |||
| rootPrev := s.root | |||
| s.root = newSnapshot | |||
| atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch) | |||
| // release lock | |||
| s.rootLock.Unlock() | |||
| @@ -301,6 +445,9 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { | |||
| } | |||
| func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { | |||
| atomic.AddUint64(&s.stats.TotIntroduceRevertBeg, 1) | |||
| defer atomic.AddUint64(&s.stats.TotIntroduceRevertEnd, 1) | |||
| if revertTo.snapshot == nil { | |||
| err := fmt.Errorf("Cannot revert to a nil snapshot") | |||
| revertTo.applied <- err | |||
| @@ -318,9 +465,11 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { | |||
| internal: revertTo.snapshot.internal, | |||
| epoch: s.nextSnapshotEpoch, | |||
| refs: 1, | |||
| creator: "revertToSnapshot", | |||
| } | |||
| s.nextSnapshotEpoch++ | |||
| var docsToPersistCount, memSegments, fileSegments uint64 | |||
| // iterate through segments | |||
| for i, segmentSnapshot := range revertTo.snapshot.segment { | |||
| newSnapshot.segment[i] = &SegmentSnapshot{ | |||
| @@ -328,21 +477,37 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { | |||
| segment: segmentSnapshot.segment, | |||
| deleted: segmentSnapshot.deleted, | |||
| cachedDocs: segmentSnapshot.cachedDocs, | |||
| creator: segmentSnapshot.creator, | |||
| } | |||
| newSnapshot.segment[i].segment.AddRef() | |||
| // remove segment from ineligibleForRemoval map | |||
| filename := zapFileName(segmentSnapshot.id) | |||
| delete(s.ineligibleForRemoval, filename) | |||
| if isMemorySegment(segmentSnapshot) { | |||
| docsToPersistCount += segmentSnapshot.Count() | |||
| memSegments++ | |||
| } else { | |||
| fileSegments++ | |||
| } | |||
| } | |||
| atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) | |||
| atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments) | |||
| atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments) | |||
| if revertTo.persisted != nil { | |||
| s.rootPersisted = append(s.rootPersisted, revertTo.persisted) | |||
| } | |||
| newSnapshot.updateSize() | |||
| // swap in new snapshot | |||
| rootPrev := s.root | |||
| s.root = newSnapshot | |||
| atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch) | |||
| // release lock | |||
| s.rootLock.Unlock() | |||
| @@ -354,3 +519,12 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { | |||
| return nil | |||
| } | |||
| func isMemorySegment(s *SegmentSnapshot) bool { | |||
| switch s.segment.(type) { | |||
| case *zap.SegmentBase: | |||
| return true | |||
| default: | |||
| return false | |||
| } | |||
| } | |||
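| // The switch above works because *zap.SegmentBase is the in-memory form | |||
| // (built from analysis results or an in-memory merge), while *zap.Segment | |||
| // is the mmap'ed, file-backed form returned by zap.Open. A sketch of the | |||
| // distinction, with hypothetical inputs (results, file name): | |||
| sb, _, _ := zap.AnalysisResultsToSegmentBase(results, DefaultChunkFactor) | |||
| fmt.Println(isMemorySegment(&SegmentSnapshot{segment: sb})) // true | |||
| fileSeg, _ := zap.Open("00000000000000000001.zap") | |||
| fmt.Println(isMemorySegment(&SegmentSnapshot{segment: fileSeg})) // false | |||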
| @@ -15,9 +15,7 @@ | |||
| package scorch | |||
| import ( | |||
| "bytes" | |||
| "encoding/json" | |||
| "fmt" | |||
| "os" | |||
| "sync/atomic" | |||
| @@ -40,16 +38,20 @@ func (s *Scorch) mergerLoop() { | |||
| OUTER: | |||
| for { | |||
| atomic.AddUint64(&s.stats.TotFileMergeLoopBeg, 1) | |||
| select { | |||
| case <-s.closeCh: | |||
| break OUTER | |||
| default: | |||
| // check to see if there is a new snapshot to merge | |||
| s.rootLock.RLock() | |||
| s.rootLock.Lock() | |||
| ourSnapshot := s.root | |||
| ourSnapshot.AddRef() | |||
| s.rootLock.RUnlock() | |||
| atomic.StoreUint64(&s.iStats.mergeSnapshotSize, uint64(ourSnapshot.Size())) | |||
| atomic.StoreUint64(&s.iStats.mergeEpoch, ourSnapshot.epoch) | |||
| s.rootLock.Unlock() | |||
| if ourSnapshot.epoch != lastEpochMergePlanned { | |||
| startTime := time.Now() | |||
| @@ -57,12 +59,21 @@ OUTER: | |||
| // let's get started | |||
| err := s.planMergeAtSnapshot(ourSnapshot, mergePlannerOptions) | |||
| if err != nil { | |||
| atomic.StoreUint64(&s.iStats.mergeEpoch, 0) | |||
| if err == segment.ErrClosed { | |||
| // index has been closed | |||
| _ = ourSnapshot.DecRef() | |||
| break OUTER | |||
| } | |||
| s.fireAsyncError(fmt.Errorf("merging err: %v", err)) | |||
| _ = ourSnapshot.DecRef() | |||
| atomic.AddUint64(&s.stats.TotFileMergeLoopErr, 1) | |||
| continue OUTER | |||
| } | |||
| lastEpochMergePlanned = ourSnapshot.epoch | |||
| atomic.StoreUint64(&s.stats.LastMergedEpoch, ourSnapshot.epoch) | |||
| s.fireEvent(EventKindMergerProgress, time.Since(startTime)) | |||
| } | |||
| _ = ourSnapshot.DecRef() | |||
| @@ -88,7 +99,10 @@ OUTER: | |||
| case <-ew.notifyCh: | |||
| } | |||
| } | |||
| atomic.AddUint64(&s.stats.TotFileMergeLoopEnd, 1) | |||
| } | |||
| s.asyncTasks.Done() | |||
| } | |||
| @@ -105,6 +119,11 @@ func (s *Scorch) parseMergePlannerOptions() (*mergeplan.MergePlanOptions, | |||
| if err != nil { | |||
| return &mergePlannerOptions, err | |||
| } | |||
| err = mergeplan.ValidateMergePlannerOptions(&mergePlannerOptions) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| return &mergePlannerOptions, nil | |||
| } | |||
| @@ -119,32 +138,45 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, | |||
| } | |||
| } | |||
| atomic.AddUint64(&s.stats.TotFileMergePlan, 1) | |||
| // give this list to the planner | |||
| resultMergePlan, err := mergeplan.Plan(onlyZapSnapshots, options) | |||
| if err != nil { | |||
| atomic.AddUint64(&s.stats.TotFileMergePlanErr, 1) | |||
| return fmt.Errorf("merge planning err: %v", err) | |||
| } | |||
| if resultMergePlan == nil { | |||
| // nothing to do | |||
| atomic.AddUint64(&s.stats.TotFileMergePlanNone, 1) | |||
| return nil | |||
| } | |||
| atomic.AddUint64(&s.stats.TotFileMergePlanOk, 1) | |||
| atomic.AddUint64(&s.stats.TotFileMergePlanTasks, uint64(len(resultMergePlan.Tasks))) | |||
| // process tasks in serial for now | |||
| var notifications []chan *IndexSnapshot | |||
| for _, task := range resultMergePlan.Tasks { | |||
| if len(task.Segments) == 0 { | |||
| atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegmentsEmpty, 1) | |||
| continue | |||
| } | |||
| atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegments, uint64(len(task.Segments))) | |||
| oldMap := make(map[uint64]*SegmentSnapshot) | |||
| newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) | |||
| segmentsToMerge := make([]*zap.Segment, 0, len(task.Segments)) | |||
| docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments)) | |||
| for _, planSegment := range task.Segments { | |||
| if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok { | |||
| oldMap[segSnapshot.id] = segSnapshot | |||
| if zapSeg, ok := segSnapshot.segment.(*zap.Segment); ok { | |||
| if segSnapshot.LiveSize() == 0 { | |||
| atomic.AddUint64(&s.stats.TotFileMergeSegmentsEmpty, 1) | |||
| oldMap[segSnapshot.id] = nil | |||
| } else { | |||
| segmentsToMerge = append(segmentsToMerge, zapSeg) | |||
| @@ -155,32 +187,53 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, | |||
| } | |||
| var oldNewDocNums map[uint64][]uint64 | |||
| var segment segment.Segment | |||
| var seg segment.Segment | |||
| if len(segmentsToMerge) > 0 { | |||
| filename := zapFileName(newSegmentID) | |||
| s.markIneligibleForRemoval(filename) | |||
| path := s.path + string(os.PathSeparator) + filename | |||
| newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024) | |||
| fileMergeZapStartTime := time.Now() | |||
| atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) | |||
| newDocNums, _, err := zap.Merge(segmentsToMerge, docsToDrop, path, | |||
| DefaultChunkFactor, s.closeCh, s) | |||
| atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) | |||
| fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime)) | |||
| atomic.AddUint64(&s.stats.TotFileMergeZapTime, fileMergeZapTime) | |||
| if atomic.LoadUint64(&s.stats.MaxFileMergeZapTime) < fileMergeZapTime { | |||
| atomic.StoreUint64(&s.stats.MaxFileMergeZapTime, fileMergeZapTime) | |||
| } | |||
| if err != nil { | |||
| s.unmarkIneligibleForRemoval(filename) | |||
| atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) | |||
| if err == segment.ErrClosed { | |||
| return err | |||
| } | |||
| return fmt.Errorf("merging failed: %v", err) | |||
| } | |||
| segment, err = zap.Open(path) | |||
| seg, err = zap.Open(path) | |||
| if err != nil { | |||
| s.unmarkIneligibleForRemoval(filename) | |||
| atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) | |||
| return err | |||
| } | |||
| oldNewDocNums = make(map[uint64][]uint64) | |||
| for i, segNewDocNums := range newDocNums { | |||
| oldNewDocNums[task.Segments[i].Id()] = segNewDocNums | |||
| } | |||
| atomic.AddUint64(&s.stats.TotFileMergeSegments, uint64(len(segmentsToMerge))) | |||
| } | |||
| sm := &segmentMerge{ | |||
| id: newSegmentID, | |||
| old: oldMap, | |||
| oldNewDocNums: oldNewDocNums, | |||
| new: segment, | |||
| new: seg, | |||
| notify: make(chan *IndexSnapshot, 1), | |||
| } | |||
| notifications = append(notifications, sm.notify) | |||
| @@ -188,21 +241,28 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, | |||
| // give it to the introducer | |||
| select { | |||
| case <-s.closeCh: | |||
| _ = segment.Close() | |||
| return nil | |||
| _ = seg.Close() | |||
| return segment.ErrClosed | |||
| case s.merges <- sm: | |||
| atomic.AddUint64(&s.stats.TotFileMergeIntroductions, 1) | |||
| } | |||
| atomic.AddUint64(&s.stats.TotFileMergePlanTasksDone, 1) | |||
| } | |||
| for _, notification := range notifications { | |||
| select { | |||
| case <-s.closeCh: | |||
| return nil | |||
| atomic.AddUint64(&s.stats.TotFileMergeIntroductionsSkipped, 1) | |||
| return segment.ErrClosed | |||
| case newSnapshot := <-notification: | |||
| atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1) | |||
| if newSnapshot != nil { | |||
| _ = newSnapshot.DecRef() | |||
| } | |||
| } | |||
| } | |||
| return nil | |||
| } | |||
| @@ -219,44 +279,48 @@ type segmentMerge struct { | |||
| // into the root | |||
| func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, | |||
| sbs []*zap.SegmentBase, sbsDrops []*roaring.Bitmap, sbsIndexes []int, | |||
| chunkFactor uint32) (uint64, *IndexSnapshot, uint64, error) { | |||
| var br bytes.Buffer | |||
| chunkFactor uint32) (*IndexSnapshot, uint64, error) { | |||
| atomic.AddUint64(&s.stats.TotMemMergeBeg, 1) | |||
| cr := zap.NewCountHashWriter(&br) | |||
| memMergeZapStartTime := time.Now() | |||
| newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, | |||
| docValueOffset, dictLocs, fieldsInv, fieldsMap, err := | |||
| zap.MergeToWriter(sbs, sbsDrops, chunkFactor, cr) | |||
| if err != nil { | |||
| return 0, nil, 0, err | |||
| } | |||
| sb, err := zap.InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor, | |||
| fieldsMap, fieldsInv, numDocs, storedIndexOffset, fieldsIndexOffset, | |||
| docValueOffset, dictLocs) | |||
| if err != nil { | |||
| return 0, nil, 0, err | |||
| } | |||
| atomic.AddUint64(&s.stats.TotMemMergeZapBeg, 1) | |||
| newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) | |||
| filename := zapFileName(newSegmentID) | |||
| path := s.path + string(os.PathSeparator) + filename | |||
| err = zap.PersistSegmentBase(sb, path) | |||
| newDocNums, _, err := | |||
| zap.MergeSegmentBases(sbs, sbsDrops, path, chunkFactor, s.closeCh, s) | |||
| atomic.AddUint64(&s.stats.TotMemMergeZapEnd, 1) | |||
| memMergeZapTime := uint64(time.Since(memMergeZapStartTime)) | |||
| atomic.AddUint64(&s.stats.TotMemMergeZapTime, memMergeZapTime) | |||
| if atomic.LoadUint64(&s.stats.MaxMemMergeZapTime) < memMergeZapTime { | |||
| atomic.StoreUint64(&s.stats.MaxMemMergeZapTime, memMergeZapTime) | |||
| } | |||
| if err != nil { | |||
| return 0, nil, 0, err | |||
| atomic.AddUint64(&s.stats.TotMemMergeErr, 1) | |||
| return nil, 0, err | |||
| } | |||
| segment, err := zap.Open(path) | |||
| seg, err := zap.Open(path) | |||
| if err != nil { | |||
| return 0, nil, 0, err | |||
| atomic.AddUint64(&s.stats.TotMemMergeErr, 1) | |||
| return nil, 0, err | |||
| } | |||
| // update persisted stats | |||
| atomic.AddUint64(&s.stats.TotPersistedItems, seg.Count()) | |||
| atomic.AddUint64(&s.stats.TotPersistedSegments, 1) | |||
| sm := &segmentMerge{ | |||
| id: newSegmentID, | |||
| old: make(map[uint64]*SegmentSnapshot), | |||
| oldNewDocNums: make(map[uint64][]uint64), | |||
| new: segment, | |||
| new: seg, | |||
| notify: make(chan *IndexSnapshot, 1), | |||
| } | |||
| @@ -268,15 +332,21 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, | |||
| select { // send to introducer | |||
| case <-s.closeCh: | |||
| _ = segment.DecRef() | |||
| return 0, nil, 0, nil // TODO: return ErrInterruptedClosed? | |||
| _ = seg.DecRef() | |||
| return nil, 0, segment.ErrClosed | |||
| case s.merges <- sm: | |||
| } | |||
| select { // wait for introduction to complete | |||
| case <-s.closeCh: | |||
| return 0, nil, 0, nil // TODO: return ErrInterruptedClosed? | |||
| return nil, 0, segment.ErrClosed | |||
| case newSnapshot := <-sm.notify: | |||
| return numDocs, newSnapshot, newSegmentID, nil | |||
| atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(len(sbs))) | |||
| atomic.AddUint64(&s.stats.TotMemMergeDone, 1) | |||
| return newSnapshot, newSegmentID, nil | |||
| } | |||
| } | |||
| func (s *Scorch) ReportBytesWritten(bytesWritten uint64) { | |||
| atomic.AddUint64(&s.stats.TotFileMergeWrittenBytes, bytesWritten) | |||
| } | |||
| @@ -18,6 +18,7 @@ | |||
| package mergeplan | |||
| import ( | |||
| "errors" | |||
| "fmt" | |||
| "math" | |||
| "sort" | |||
| @@ -115,7 +116,15 @@ func (o *MergePlanOptions) RaiseToFloorSegmentSize(s int64) int64 { | |||
| return o.FloorSegmentSize | |||
| } | |||
| // Suggested default options. | |||
| // MaxSegmentSizeLimit represents the maximum allowed size of a segment; | |||
| // this limit comes from the 1-hit optimisation, whose encoding caps | |||
| // values at uint31 (1<<31 - 1 = 2147483647). | |||
| const MaxSegmentSizeLimit = 1<<31 - 1 | |||
| // ErrMaxSegmentSizeTooLarge is returned when the size of the segment | |||
| // exceeds the MaxSegmentSizeLimit | |||
| var ErrMaxSegmentSizeTooLarge = errors.New("MaxSegmentSize exceeds the size limit") | |||
| // DefaultMergePlanOptions suggests the default options. | |||
| var DefaultMergePlanOptions = MergePlanOptions{ | |||
| MaxSegmentsPerTier: 10, | |||
| MaxSegmentSize: 5000000, | |||
| @@ -208,14 +217,14 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) { | |||
| if len(roster) > 0 { | |||
| rosterScore := scoreSegments(roster, o) | |||
| if len(bestRoster) <= 0 || rosterScore < bestRosterScore { | |||
| if len(bestRoster) == 0 || rosterScore < bestRosterScore { | |||
| bestRoster = roster | |||
| bestRosterScore = rosterScore | |||
| } | |||
| } | |||
| } | |||
| if len(bestRoster) <= 0 { | |||
| if len(bestRoster) == 0 { | |||
| return rv, nil | |||
| } | |||
| @@ -367,3 +376,11 @@ func ToBarChart(prefix string, barMax int, segments []Segment, plan *MergePlan) | |||
| return strings.Join(rv, "\n") | |||
| } | |||
| // ValidateMergePlannerOptions validates the merge planner options | |||
| func ValidateMergePlannerOptions(options *MergePlanOptions) error { | |||
| if options.MaxSegmentSize > MaxSegmentSizeLimit { | |||
| return ErrMaxSegmentSizeTooLarge | |||
| } | |||
| return nil | |||
| } | |||
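| // A sketch of guarding custom merge-planner settings with the validator | |||
| // above (option values illustrative only): | |||
| opts := mergeplan.MergePlanOptions{ | |||
| MaxSegmentsPerTier: 10, | |||
| MaxSegmentSize: 1 << 32, // deliberately above MaxSegmentSizeLimit | |||
| } | |||
| if err := mergeplan.ValidateMergePlannerOptions(&opts); err != nil { | |||
| // err is ErrMaxSegmentSizeTooLarge here | |||
| } | |||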
| @@ -0,0 +1,420 @@ | |||
| // Copyright (c) 2018 Couchbase, Inc. | |||
| // | |||
| // Licensed under the Apache License, Version 2.0 (the "License"); | |||
| // you may not use this file except in compliance with the License. | |||
| // You may obtain a copy of the License at | |||
| // | |||
| // http://www.apache.org/licenses/LICENSE-2.0 | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software | |||
| // distributed under the License is distributed on an "AS IS" BASIS, | |||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| // See the License for the specific language governing permissions and | |||
| // limitations under the License. | |||
| package scorch | |||
| import ( | |||
| "fmt" | |||
| "github.com/RoaringBitmap/roaring" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/index/scorch/segment" | |||
| "github.com/blevesearch/bleve/index/scorch/segment/zap" | |||
| ) | |||
| var OptimizeConjunction = true | |||
| var OptimizeConjunctionUnadorned = true | |||
| var OptimizeDisjunctionUnadorned = true | |||
| func (s *IndexSnapshotTermFieldReader) Optimize(kind string, | |||
| octx index.OptimizableContext) (index.OptimizableContext, error) { | |||
| if OptimizeConjunction && kind == "conjunction" { | |||
| return s.optimizeConjunction(octx) | |||
| } | |||
| if OptimizeConjunctionUnadorned && kind == "conjunction:unadorned" { | |||
| return s.optimizeConjunctionUnadorned(octx) | |||
| } | |||
| if OptimizeDisjunctionUnadorned && kind == "disjunction:unadorned" { | |||
| return s.optimizeDisjunctionUnadorned(octx) | |||
| } | |||
| return octx, nil | |||
| } | |||
| var OptimizeDisjunctionUnadornedMinChildCardinality = uint64(256) | |||
| // ---------------------------------------------------------------- | |||
| func (s *IndexSnapshotTermFieldReader) optimizeConjunction( | |||
| octx index.OptimizableContext) (index.OptimizableContext, error) { | |||
| if octx == nil { | |||
| octx = &OptimizeTFRConjunction{snapshot: s.snapshot} | |||
| } | |||
| o, ok := octx.(*OptimizeTFRConjunction) | |||
| if !ok { | |||
| return octx, nil | |||
| } | |||
| if o.snapshot != s.snapshot { | |||
| return nil, fmt.Errorf("tried to optimize conjunction across different snapshots") | |||
| } | |||
| o.tfrs = append(o.tfrs, s) | |||
| return o, nil | |||
| } | |||
| type OptimizeTFRConjunction struct { | |||
| snapshot *IndexSnapshot | |||
| tfrs []*IndexSnapshotTermFieldReader | |||
| } | |||
| func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) { | |||
| if len(o.tfrs) <= 1 { | |||
| return nil, nil | |||
| } | |||
| for i := range o.snapshot.segment { | |||
| itr0, ok := o.tfrs[0].iterators[i].(*zap.PostingsIterator) | |||
| if !ok || itr0.ActualBM == nil { | |||
| continue | |||
| } | |||
| itr1, ok := o.tfrs[1].iterators[i].(*zap.PostingsIterator) | |||
| if !ok || itr1.ActualBM == nil { | |||
| continue | |||
| } | |||
| bm := roaring.And(itr0.ActualBM, itr1.ActualBM) | |||
| for _, tfr := range o.tfrs[2:] { | |||
| itr, ok := tfr.iterators[i].(*zap.PostingsIterator) | |||
| if !ok || itr.ActualBM == nil { | |||
| continue | |||
| } | |||
| bm.And(itr.ActualBM) | |||
| } | |||
| // in this conjunction optimization, the postings iterators | |||
| // will all share the same AND'ed together actual bitmap. The | |||
| // regular conjunction searcher machinery will still be used, | |||
| // but the underlying bitmap will be smaller. | |||
| for _, tfr := range o.tfrs { | |||
| itr, ok := tfr.iterators[i].(*zap.PostingsIterator) | |||
| if ok && itr.ActualBM != nil { | |||
| itr.ActualBM = bm | |||
| itr.Actual = bm.Iterator() | |||
| } | |||
| } | |||
| } | |||
| return nil, nil | |||
| } | |||
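| // The sharing above reduces to plain roaring intersections; a standalone | |||
| // illustration (assumes: import "github.com/RoaringBitmap/roaring"): | |||
| a := roaring.BitmapOf(1, 5, 9) | |||
| b := roaring.BitmapOf(5, 9, 42) | |||
| shared := roaring.And(a, b) // {5, 9}; every tfr iterator then walks this smaller bitmap | |||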
| // ---------------------------------------------------------------- | |||
| // An "unadorned" conjunction optimization is appropriate when | |||
| // additional or subsidiary information like freq-norm's and | |||
| // term-vectors are not required, and instead only the internal-id's | |||
| // are needed. | |||
| func (s *IndexSnapshotTermFieldReader) optimizeConjunctionUnadorned( | |||
| octx index.OptimizableContext) (index.OptimizableContext, error) { | |||
| if octx == nil { | |||
| octx = &OptimizeTFRConjunctionUnadorned{snapshot: s.snapshot} | |||
| } | |||
| o, ok := octx.(*OptimizeTFRConjunctionUnadorned) | |||
| if !ok { | |||
| return nil, nil | |||
| } | |||
| if o.snapshot != s.snapshot { | |||
| return nil, fmt.Errorf("tried to optimize unadorned conjunction across different snapshots") | |||
| } | |||
| o.tfrs = append(o.tfrs, s) | |||
| return o, nil | |||
| } | |||
| type OptimizeTFRConjunctionUnadorned struct { | |||
| snapshot *IndexSnapshot | |||
| tfrs []*IndexSnapshotTermFieldReader | |||
| } | |||
| var OptimizeTFRConjunctionUnadornedTerm = []byte("<conjunction:unadorned>") | |||
| var OptimizeTFRConjunctionUnadornedField = "*" | |||
| // Finish of an unadorned conjunction optimization will compute a | |||
| // termFieldReader with an "actual" bitmap that represents the | |||
| // constituent bitmaps AND'ed together. This termFieldReader cannot | |||
| // provide any freq-norm or termVector associated information. | |||
| func (o *OptimizeTFRConjunctionUnadorned) Finish() (rv index.Optimized, err error) { | |||
| if len(o.tfrs) <= 1 { | |||
| return nil, nil | |||
| } | |||
| // We use an artificial term and field because the optimized | |||
| // termFieldReader can represent multiple terms and fields. | |||
| oTFR := &IndexSnapshotTermFieldReader{ | |||
| term: OptimizeTFRConjunctionUnadornedTerm, | |||
| field: OptimizeTFRConjunctionUnadornedField, | |||
| snapshot: o.snapshot, | |||
| iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)), | |||
| segmentOffset: 0, | |||
| includeFreq: false, | |||
| includeNorm: false, | |||
| includeTermVectors: false, | |||
| } | |||
| var actualBMs []*roaring.Bitmap // Collected from regular posting lists. | |||
| OUTER: | |||
| for i := range o.snapshot.segment { | |||
| actualBMs = actualBMs[:0] | |||
| var docNum1HitLast uint64 | |||
| var docNum1HitLastOk bool | |||
| for _, tfr := range o.tfrs { | |||
| if _, ok := tfr.iterators[i].(*segment.EmptyPostingsIterator); ok { | |||
| // An empty postings iterator means the entire AND is empty. | |||
| oTFR.iterators[i] = segment.AnEmptyPostingsIterator | |||
| continue OUTER | |||
| } | |||
| itr, ok := tfr.iterators[i].(*zap.PostingsIterator) | |||
| if !ok { | |||
| // We optimize zap postings iterators only. | |||
| return nil, nil | |||
| } | |||
| // If the postings iterator is "1-hit" optimized, then we | |||
| // can perform several optimizations up-front here. | |||
| docNum1Hit, ok := itr.DocNum1Hit() | |||
| if ok { | |||
| if docNum1Hit == zap.DocNum1HitFinished { | |||
| // An empty docNum here means the entire AND is empty. | |||
| oTFR.iterators[i] = segment.AnEmptyPostingsIterator | |||
| continue OUTER | |||
| } | |||
| if docNum1HitLastOk && docNum1HitLast != docNum1Hit { | |||
| // The docNum1Hit doesn't match the previous | |||
| // docNum1HitLast, so the entire AND is empty. | |||
| oTFR.iterators[i] = segment.AnEmptyPostingsIterator | |||
| continue OUTER | |||
| } | |||
| docNum1HitLast = docNum1Hit | |||
| docNum1HitLastOk = true | |||
| continue | |||
| } | |||
| if itr.ActualBM == nil { | |||
| // An empty actual bitmap means the entire AND is empty. | |||
| oTFR.iterators[i] = segment.AnEmptyPostingsIterator | |||
| continue OUTER | |||
| } | |||
| // Collect the actual bitmap for more processing later. | |||
| actualBMs = append(actualBMs, itr.ActualBM) | |||
| } | |||
| if docNum1HitLastOk { | |||
| // We reach here if all the 1-hit optimized posting | |||
| // iterators had the same 1-hit docNum, so we can check if | |||
| // our collected actual bitmaps also have that docNum. | |||
| for _, bm := range actualBMs { | |||
| if !bm.Contains(uint32(docNum1HitLast)) { | |||
| // The docNum1Hit isn't in one of our actual | |||
| // bitmaps, so the entire AND is empty. | |||
| oTFR.iterators[i] = segment.AnEmptyPostingsIterator | |||
| continue OUTER | |||
| } | |||
| } | |||
| // The actual bitmaps and docNum1Hits all contain or have | |||
| // the same 1-hit docNum, so that's our AND'ed result. | |||
| oTFR.iterators[i], err = zap.PostingsIteratorFrom1Hit( | |||
| docNum1HitLast, zap.NormBits1Hit, false, false) | |||
| if err != nil { | |||
| return nil, nil | |||
| } | |||
| continue OUTER | |||
| } | |||
| if len(actualBMs) == 0 { | |||
| // If we've collected no actual bitmaps at this point, | |||
| // then the entire AND is empty. | |||
| oTFR.iterators[i] = segment.AnEmptyPostingsIterator | |||
| continue OUTER | |||
| } | |||
| if len(actualBMs) == 1 { | |||
| // If we've only 1 actual bitmap, then that's our result. | |||
| oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap( | |||
| actualBMs[0], false, false) | |||
| if err != nil { | |||
| return nil, nil | |||
| } | |||
| continue OUTER | |||
| } | |||
| // Else, AND together our collected bitmaps as our result. | |||
| bm := roaring.And(actualBMs[0], actualBMs[1]) | |||
| for _, actualBM := range actualBMs[2:] { | |||
| bm.And(actualBM) | |||
| } | |||
| oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap( | |||
| bm, false, false) | |||
| if err != nil { | |||
| return nil, nil | |||
| } | |||
| } | |||
| return oTFR, nil | |||
| } | |||
| // ---------------------------------------------------------------- | |||
| // An "unadorned" disjunction optimization is appropriate when | |||
| // additional or subsidiary information like freq-norm's and | |||
| // term-vectors are not required, and instead only the internal-id's | |||
| // are needed. | |||
| func (s *IndexSnapshotTermFieldReader) optimizeDisjunctionUnadorned( | |||
| octx index.OptimizableContext) (index.OptimizableContext, error) { | |||
| if octx == nil { | |||
| octx = &OptimizeTFRDisjunctionUnadorned{snapshot: s.snapshot} | |||
| } | |||
| o, ok := octx.(*OptimizeTFRDisjunctionUnadorned) | |||
| if !ok { | |||
| return nil, nil | |||
| } | |||
| if o.snapshot != s.snapshot { | |||
| return nil, fmt.Errorf("tried to optimize unadorned disjunction across different snapshots") | |||
| } | |||
| o.tfrs = append(o.tfrs, s) | |||
| return o, nil | |||
| } | |||
| type OptimizeTFRDisjunctionUnadorned struct { | |||
| snapshot *IndexSnapshot | |||
| tfrs []*IndexSnapshotTermFieldReader | |||
| } | |||
| var OptimizeTFRDisjunctionUnadornedTerm = []byte("<disjunction:unadorned>") | |||
| var OptimizeTFRDisjunctionUnadornedField = "*" | |||
| // Finish of an unadorned disjunction optimization will compute a | |||
| // termFieldReader with an "actual" bitmap that represents the | |||
| // constituent bitmaps OR'ed together. This termFieldReader cannot | |||
| // provide any freq-norm or termVector associated information. | |||
| func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err error) { | |||
| if len(o.tfrs) <= 1 { | |||
| return nil, nil | |||
| } | |||
| for i := range o.snapshot.segment { | |||
| var cMax uint64 | |||
| for _, tfr := range o.tfrs { | |||
| itr, ok := tfr.iterators[i].(*zap.PostingsIterator) | |||
| if !ok { | |||
| return nil, nil | |||
| } | |||
| if itr.ActualBM != nil { | |||
| c := itr.ActualBM.GetCardinality() | |||
| if cMax < c { | |||
| cMax = c | |||
| } | |||
| } | |||
| } | |||
| // Heuristic to skip the optimization if all the constituent | |||
| // bitmaps are too small, where the processing & resource | |||
| // overhead to create the OR'ed bitmap outweighs the benefit. | |||
| if cMax < OptimizeDisjunctionUnadornedMinChildCardinality { | |||
| return nil, nil | |||
| } | |||
| } | |||
| // We use an artificial term and field because the optimized | |||
| // termFieldReader can represent multiple terms and fields. | |||
| oTFR := &IndexSnapshotTermFieldReader{ | |||
| term: OptimizeTFRDisjunctionUnadornedTerm, | |||
| field: OptimizeTFRDisjunctionUnadornedField, | |||
| snapshot: o.snapshot, | |||
| iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)), | |||
| segmentOffset: 0, | |||
| includeFreq: false, | |||
| includeNorm: false, | |||
| includeTermVectors: false, | |||
| } | |||
| var docNums []uint32 // Collected docNum's from 1-hit posting lists. | |||
| var actualBMs []*roaring.Bitmap // Collected from regular posting lists. | |||
| for i := range o.snapshot.segment { | |||
| docNums = docNums[:0] | |||
| actualBMs = actualBMs[:0] | |||
| for _, tfr := range o.tfrs { | |||
| itr, ok := tfr.iterators[i].(*zap.PostingsIterator) | |||
| if !ok { | |||
| return nil, nil | |||
| } | |||
| docNum, ok := itr.DocNum1Hit() | |||
| if ok { | |||
| docNums = append(docNums, uint32(docNum)) | |||
| continue | |||
| } | |||
| if itr.ActualBM != nil { | |||
| actualBMs = append(actualBMs, itr.ActualBM) | |||
| } | |||
| } | |||
| var bm *roaring.Bitmap | |||
| if len(actualBMs) > 2 { | |||
| bm = roaring.HeapOr(actualBMs...) | |||
| } else if len(actualBMs) == 2 { | |||
| bm = roaring.Or(actualBMs[0], actualBMs[1]) | |||
| } else if len(actualBMs) == 1 { | |||
| bm = actualBMs[0].Clone() | |||
| } | |||
| if bm == nil { | |||
| bm = roaring.New() | |||
| } | |||
| bm.AddMany(docNums) | |||
| oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(bm, false, false) | |||
| if err != nil { | |||
| return nil, nil | |||
| } | |||
| } | |||
| return oTFR, nil | |||
| } | |||
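| // The OR side is likewise plain roaring work plus the collected 1-hit | |||
| // docNums; an isolated illustration (assumes the same roaring import): | |||
| bms := []*roaring.Bitmap{roaring.BitmapOf(1, 2), roaring.BitmapOf(2, 3)} | |||
| union := roaring.HeapOr(bms...) // {1, 2, 3} | |||
| union.AddMany([]uint32{42}) // fold in docNums from 1-hit postings | |||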
| @@ -16,9 +16,12 @@ package scorch | |||
| import ( | |||
| "bytes" | |||
| "encoding/binary" | |||
| "encoding/json" | |||
| "fmt" | |||
| "io/ioutil" | |||
| "log" | |||
| "math" | |||
| "os" | |||
| "path/filepath" | |||
| "strconv" | |||
| @@ -27,23 +30,57 @@ import ( | |||
| "time" | |||
| "github.com/RoaringBitmap/roaring" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/index/scorch/segment" | |||
| "github.com/blevesearch/bleve/index/scorch/segment/zap" | |||
| "github.com/boltdb/bolt" | |||
| bolt "github.com/etcd-io/bbolt" | |||
| ) | |||
| var DefaultChunkFactor uint32 = 1024 | |||
| // Arbitrary number; it needs to be made configurable. | |||
| // Lower values like 10, which make the persister really slow, | |||
| // don't work well, as they create more files to persist in the | |||
| // next persist iteration and spike the number of open FDs. | |||
| // The ideal value should let the persister proceed at an | |||
| // optimum pace so that the merger can skip | |||
| // many intermediate snapshots. | |||
| // This needs to be based on empirical data. | |||
| // TODO - may need to revisit this approach/value. | |||
| var epochDistance = uint64(5) | |||
| // DefaultPersisterNapTimeMSec is kept at zero as this helps in direct | |||
| // persistence of segments with the default safe batch option. | |||
| // If the default safe batch option results in a high number of | |||
| // files on disk, then users may initialise this configuration parameter | |||
| // with a higher value so that the persister will nap a bit within its | |||
| // work loop to favour better in-memory merging of segments, resulting | |||
| // in fewer segment files on disk. But that may come with an indexing | |||
| // performance overhead. | |||
| // Unsafe batch users are advised to override this to a higher value | |||
| // for better performance, especially with high data density. | |||
| var DefaultPersisterNapTimeMSec int = 0 // ms | |||
| // DefaultPersisterNapUnderNumFiles helps in controlling the pace of the | |||
| // persister. At times of slow merger progress with heavy file-merging | |||
| // operations, it is better to pace down the persister and let the merger | |||
| // catch up within a range defined by this parameter. | |||
| // Fewer files on disk (as per the merge plan) keep the file handle | |||
| // usage under the limit and result in a faster disk merger and a | |||
| // healthier index. | |||
| // It has been observed that such a loosely sync'ed introducer-persister-merger | |||
| // trio results in better overall performance. | |||
| var DefaultPersisterNapUnderNumFiles int = 1000 | |||
| var DefaultMemoryPressurePauseThreshold uint64 = math.MaxUint64 | |||
| type persisterOptions struct { | |||
| // PersisterNapTimeMSec controls the wait/delay injected into the | |||
| // persistence work loop to improve the chances for | |||
| // healthier and heavier in-memory merging | |||
| PersisterNapTimeMSec int | |||
| // PersisterNapUnderNumFiles: if PersisterNapTimeMSec > 0, and the | |||
| // number of files is less than PersisterNapUnderNumFiles, then the | |||
| // persister will sleep PersisterNapTimeMSec amount of time to improve | |||
| // the chances for healthier and heavier in-memory merging | |||
| PersisterNapUnderNumFiles int | |||
| // MemoryPressurePauseThreshold lets the persister have better leeway | |||
| // for prudently performing the memory merge of segments in a memory | |||
| // pressure situation. Here the config value is an upper threshold | |||
| // for the number of paused application threads. The default value is | |||
| // a very high number to always favour the merging of memory segments. | |||
| MemoryPressurePauseThreshold uint64 | |||
| } | |||
| type notificationChan chan struct{} | |||
| @@ -53,8 +90,17 @@ func (s *Scorch) persisterLoop() { | |||
| var persistWatchers []*epochWatcher | |||
| var lastPersistedEpoch, lastMergedEpoch uint64 | |||
| var ew *epochWatcher | |||
| po, err := s.parsePersisterOptions() | |||
| if err != nil { | |||
| s.fireAsyncError(fmt.Errorf("persisterOptions json parsing err: %v", err)) | |||
| s.asyncTasks.Done() | |||
| return | |||
| } | |||
| OUTER: | |||
| for { | |||
| atomic.AddUint64(&s.stats.TotPersistLoopBeg, 1) | |||
| select { | |||
| case <-s.closeCh: | |||
| break OUTER | |||
| @@ -65,11 +111,13 @@ OUTER: | |||
| if ew != nil && ew.epoch > lastMergedEpoch { | |||
| lastMergedEpoch = ew.epoch | |||
| } | |||
| persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch, | |||
| &lastMergedEpoch, persistWatchers) | |||
| lastMergedEpoch, persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch, | |||
| lastMergedEpoch, persistWatchers, po) | |||
| var ourSnapshot *IndexSnapshot | |||
| var ourPersisted []chan error | |||
| var ourPersistedCallbacks []index.BatchCallback | |||
| // check to see if there is a new snapshot to persist | |||
| s.rootLock.Lock() | |||
| @@ -78,13 +126,17 @@ OUTER: | |||
| ourSnapshot.AddRef() | |||
| ourPersisted = s.rootPersisted | |||
| s.rootPersisted = nil | |||
| ourPersistedCallbacks = s.persistedCallbacks | |||
| s.persistedCallbacks = nil | |||
| atomic.StoreUint64(&s.iStats.persistSnapshotSize, uint64(ourSnapshot.Size())) | |||
| atomic.StoreUint64(&s.iStats.persistEpoch, ourSnapshot.epoch) | |||
| } | |||
| s.rootLock.Unlock() | |||
| if ourSnapshot != nil { | |||
| startTime := time.Now() | |||
| err := s.persistSnapshot(ourSnapshot) | |||
| err := s.persistSnapshot(ourSnapshot, po) | |||
| for _, ch := range ourPersisted { | |||
| if err != nil { | |||
| ch <- err | |||
| @@ -92,10 +144,22 @@ OUTER: | |||
| close(ch) | |||
| } | |||
| if err != nil { | |||
| atomic.StoreUint64(&s.iStats.persistEpoch, 0) | |||
| if err == segment.ErrClosed { | |||
| // index has been closed | |||
| _ = ourSnapshot.DecRef() | |||
| break OUTER | |||
| } | |||
| s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err)) | |||
| _ = ourSnapshot.DecRef() | |||
| atomic.AddUint64(&s.stats.TotPersistLoopErr, 1) | |||
| continue OUTER | |||
| } | |||
| for i := range ourPersistedCallbacks { | |||
| ourPersistedCallbacks[i](err) | |||
| } | |||
| atomic.StoreUint64(&s.stats.LastPersistedEpoch, ourSnapshot.epoch) | |||
| lastPersistedEpoch = ourSnapshot.epoch | |||
| for _, ew := range persistWatchers { | |||
| @@ -115,6 +179,8 @@ OUTER: | |||
| s.fireEvent(EventKindPersisterProgress, time.Since(startTime)) | |||
| if changed { | |||
| s.removeOldData() | |||
| atomic.AddUint64(&s.stats.TotPersistLoopProgress, 1) | |||
| continue OUTER | |||
| } | |||
| } | |||
| @@ -133,17 +199,21 @@ OUTER: | |||
| s.removeOldData() // might as well cleanup while waiting | |||
| atomic.AddUint64(&s.stats.TotPersistLoopWait, 1) | |||
| select { | |||
| case <-s.closeCh: | |||
| break OUTER | |||
| case <-w.notifyCh: | |||
| // woken up, next loop should pick up work | |||
| continue OUTER | |||
| atomic.AddUint64(&s.stats.TotPersistLoopWaitNotified, 1) | |||
| case ew = <-s.persisterNotifier: | |||
| // if the watchers are already caught up then let them wait, | |||
| // else let them continue to do the catch up | |||
| persistWatchers = append(persistWatchers, ew) | |||
| } | |||
| atomic.AddUint64(&s.stats.TotPersistLoopEnd, 1) | |||
| } | |||
| } | |||
| @@ -160,38 +230,95 @@ func notifyMergeWatchers(lastPersistedEpoch uint64, | |||
| return watchersNext | |||
| } | |||
| func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch *uint64, | |||
| persistWatchers []*epochWatcher) []*epochWatcher { | |||
| func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch uint64, | |||
| persistWatchers []*epochWatcher, po *persisterOptions) (uint64, []*epochWatcher) { | |||
| // first, let the watchers proceed if they lag behind | |||
| persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) | |||
| // check the merger lag by counting the segment files on disk. | |||
| // On finding fewer files on disk, the persister takes a short pause | |||
| // to let sufficient in-memory segments pile up for the next | |||
| // memory-merge-cum-persist loop. | |||
| // On finding too many files on disk, the persister pauses until the | |||
| // merger catches up, to bring the segment file count under the threshold. | |||
| // But if there is memory pressure, then skip these sleep manoeuvres. | |||
| numFilesOnDisk, _ := s.diskFileStats() | |||
| if numFilesOnDisk < uint64(po.PersisterNapUnderNumFiles) && | |||
| po.PersisterNapTimeMSec > 0 && s.paused() == 0 { | |||
| select { | |||
| case <-s.closeCh: | |||
| case <-time.After(time.Millisecond * time.Duration(po.PersisterNapTimeMSec)): | |||
| atomic.AddUint64(&s.stats.TotPersisterNapPauseCompleted, 1) | |||
| case ew := <-s.persisterNotifier: | |||
| // unblock the merger in the meantime | |||
| persistWatchers = append(persistWatchers, ew) | |||
| lastMergedEpoch = ew.epoch | |||
| persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) | |||
| atomic.AddUint64(&s.stats.TotPersisterMergerNapBreak, 1) | |||
| } | |||
| return lastMergedEpoch, persistWatchers | |||
| } | |||
| OUTER: | |||
| // check for a slow merger and wait until the merger catches up | |||
| for lastPersistedEpoch > *lastMergedEpoch+epochDistance { | |||
| for po.PersisterNapUnderNumFiles > 0 && | |||
| numFilesOnDisk >= uint64(po.PersisterNapUnderNumFiles) && | |||
| lastMergedEpoch < lastPersistedEpoch { | |||
| atomic.AddUint64(&s.stats.TotPersisterSlowMergerPause, 1) | |||
| select { | |||
| case <-s.closeCh: | |||
| break OUTER | |||
| case ew := <-s.persisterNotifier: | |||
| persistWatchers = append(persistWatchers, ew) | |||
| *lastMergedEpoch = ew.epoch | |||
| lastMergedEpoch = ew.epoch | |||
| } | |||
| atomic.AddUint64(&s.stats.TotPersisterSlowMergerResume, 1) | |||
| // let the watchers proceed if they lag behind | |||
| persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) | |||
| numFilesOnDisk, _ = s.diskFileStats() | |||
| } | |||
| return persistWatchers | |||
| return lastMergedEpoch, persistWatchers | |||
| } | |||
| func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { | |||
| persisted, err := s.persistSnapshotMaybeMerge(snapshot) | |||
| if err != nil { | |||
| return err | |||
| func (s *Scorch) parsePersisterOptions() (*persisterOptions, error) { | |||
| po := persisterOptions{ | |||
| PersisterNapTimeMSec: DefaultPersisterNapTimeMSec, | |||
| PersisterNapUnderNumFiles: DefaultPersisterNapUnderNumFiles, | |||
| MemoryPressurePauseThreshold: DefaultMemoryPressurePauseThreshold, | |||
| } | |||
| if persisted { | |||
| return nil | |||
| if v, ok := s.config["scorchPersisterOptions"]; ok { | |||
| b, err := json.Marshal(v) | |||
| if err != nil { | |||
| return &po, err | |||
| } | |||
| err = json.Unmarshal(b, &po) | |||
| if err != nil { | |||
| return &po, err | |||
| } | |||
| } | |||
| return &po, nil | |||
| } | |||
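| // Since these options arrive via the index config map and round-trip | |||
| // through JSON, callers can override them roughly like this (a sketch; | |||
| // encoding/json matches the exported field names case-insensitively): | |||
| config := map[string]interface{}{ | |||
| "scorchPersisterOptions": map[string]interface{}{ | |||
| "persisterNapTimeMSec": 2000, | |||
| "persisterNapUnderNumFiles": 500, | |||
| }, | |||
| } | |||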
| func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot, | |||
| po *persisterOptions) error { | |||
| // Perform in-memory segment merging only when the memory pressure is | |||
| // below the configured threshold; otherwise the persister falls back | |||
| // to direct persistence of segments. | |||
| if s.paused() < po.MemoryPressurePauseThreshold { | |||
| persisted, err := s.persistSnapshotMaybeMerge(snapshot) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| if persisted { | |||
| return nil | |||
| } | |||
| } | |||
| return s.persistSnapshotDirect(snapshot) | |||
| @@ -224,7 +351,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) ( | |||
| return false, nil | |||
| } | |||
| _, newSnapshot, newSegmentID, err := s.mergeSegmentBases( | |||
| newSnapshot, newSegmentID, err := s.mergeSegmentBases( | |||
| snapshot, sbs, sbsDrops, sbsIndexes, DefaultChunkFactor) | |||
| if err != nil { | |||
| return false, err | |||
| @@ -249,6 +376,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) ( | |||
| segment: make([]*SegmentSnapshot, 0, len(snapshot.segment)), | |||
| internal: snapshot.internal, | |||
| epoch: snapshot.epoch, | |||
| creator: "persistSnapshotMaybeMerge", | |||
| } | |||
| // copy to the equiv the segments that weren't replaced | |||
| @@ -301,6 +429,22 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { | |||
| return err | |||
| } | |||
| // persist meta values | |||
| metaBucket, err := snapshotBucket.CreateBucketIfNotExists(boltMetaDataKey) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| err = metaBucket.Put([]byte("type"), []byte(zap.Type)) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| buf := make([]byte, binary.MaxVarintLen32) | |||
| binary.BigEndian.PutUint32(buf, zap.Version) | |||
| err = metaBucket.Put([]byte("version"), buf) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| // persist internal values | |||
| internalBucket, err := snapshotBucket.CreateBucketIfNotExists(boltInternalKey) | |||
| if err != nil { | |||
| @@ -390,44 +534,21 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { | |||
| } | |||
| } | |||
| s.rootLock.Lock() | |||
| newIndexSnapshot := &IndexSnapshot{ | |||
| parent: s, | |||
| epoch: s.nextSnapshotEpoch, | |||
| segment: make([]*SegmentSnapshot, len(s.root.segment)), | |||
| offsets: make([]uint64, len(s.root.offsets)), | |||
| internal: make(map[string][]byte, len(s.root.internal)), | |||
| refs: 1, | |||
| } | |||
| s.nextSnapshotEpoch++ | |||
| for i, segmentSnapshot := range s.root.segment { | |||
| // see if this segment has been replaced | |||
| if replacement, ok := newSegments[segmentSnapshot.id]; ok { | |||
| newSegmentSnapshot := &SegmentSnapshot{ | |||
| id: segmentSnapshot.id, | |||
| segment: replacement, | |||
| deleted: segmentSnapshot.deleted, | |||
| cachedDocs: segmentSnapshot.cachedDocs, | |||
| } | |||
| newIndexSnapshot.segment[i] = newSegmentSnapshot | |||
| delete(newSegments, segmentSnapshot.id) | |||
| // update items persisted in case of a new segment snapshot | |||
| atomic.AddUint64(&s.stats.numItemsPersisted, newSegmentSnapshot.Count()) | |||
| } else { | |||
| newIndexSnapshot.segment[i] = s.root.segment[i] | |||
| newIndexSnapshot.segment[i].segment.AddRef() | |||
| } | |||
| newIndexSnapshot.offsets[i] = s.root.offsets[i] | |||
| persist := &persistIntroduction{ | |||
| persisted: newSegments, | |||
| applied: make(notificationChan), | |||
| } | |||
| for k, v := range s.root.internal { | |||
| newIndexSnapshot.internal[k] = v | |||
| select { | |||
| case <-s.closeCh: | |||
| return segment.ErrClosed | |||
| case s.persists <- persist: | |||
| } | |||
| rootPrev := s.root | |||
| s.root = newIndexSnapshot | |||
| s.rootLock.Unlock() | |||
| if rootPrev != nil { | |||
| _ = rootPrev.DecRef() | |||
| select { | |||
| case <-s.closeCh: | |||
| return segment.ErrClosed | |||
| case <-persist.applied: | |||
| } | |||
| } | |||
| @@ -462,6 +583,7 @@ var boltSnapshotsBucket = []byte{'s'} | |||
| var boltPathKey = []byte{'p'} | |||
| var boltDeletedKey = []byte{'d'} | |||
| var boltInternalKey = []byte{'i'} | |||
| var boltMetaDataKey = []byte{'m'} | |||
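| // For reference, the resulting bolt layout can be inspected with plain | |||
| // bbolt calls; a hedged sketch (db is an already-open *bolt.DB) printing | |||
| // each snapshot's persisted type from the meta bucket written above: | |||
| _ = db.View(func(tx *bolt.Tx) error { | |||
| c := tx.Bucket(boltSnapshotsBucket).Cursor() | |||
| for k, _ := c.First(); k != nil; k, _ = c.Next() { | |||
| if meta := tx.Bucket(boltSnapshotsBucket).Bucket(k).Bucket(boltMetaDataKey); meta != nil { | |||
| fmt.Printf("snapshot %x: type=%s\n", k, meta.Get([]byte("type"))) | |||
| } | |||
| } | |||
| return nil | |||
| }) | |||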
| func (s *Scorch) loadFromBolt() error { | |||
| return s.rootBolt.View(func(tx *bolt.Tx) error { | |||
| @@ -478,19 +600,19 @@ func (s *Scorch) loadFromBolt() error { | |||
| continue | |||
| } | |||
| if foundRoot { | |||
| s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch) | |||
| s.AddEligibleForRemoval(snapshotEpoch) | |||
| continue | |||
| } | |||
| snapshot := snapshots.Bucket(k) | |||
| if snapshot == nil { | |||
| log.Printf("snapshot key, but bucket missing %x, continuing", k) | |||
| s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch) | |||
| s.AddEligibleForRemoval(snapshotEpoch) | |||
| continue | |||
| } | |||
| indexSnapshot, err := s.loadSnapshot(snapshot) | |||
| if err != nil { | |||
| log.Printf("unable to load snapshot, %v, continuing", err) | |||
| s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch) | |||
| s.AddEligibleForRemoval(snapshotEpoch) | |||
| continue | |||
| } | |||
| indexSnapshot.epoch = snapshotEpoch | |||
| @@ -500,13 +622,16 @@ func (s *Scorch) loadFromBolt() error { | |||
| return err | |||
| } | |||
| s.nextSegmentID++ | |||
| s.nextSnapshotEpoch = snapshotEpoch + 1 | |||
| s.rootLock.Lock() | |||
| if s.root != nil { | |||
| _ = s.root.DecRef() | |||
| } | |||
| s.nextSnapshotEpoch = snapshotEpoch + 1 | |||
| rootPrev := s.root | |||
| s.root = indexSnapshot | |||
| s.rootLock.Unlock() | |||
| if rootPrev != nil { | |||
| _ = rootPrev.DecRef() | |||
| } | |||
| foundRoot = true | |||
| } | |||
| return nil | |||
| @@ -524,7 +649,7 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) { | |||
| snapshotKey := segment.EncodeUvarintAscending(nil, epoch) | |||
| snapshot := snapshots.Bucket(snapshotKey) | |||
| if snapshot == nil { | |||
| return nil | |||
| return fmt.Errorf("snapshot with epoch: %v - doesn't exist", epoch) | |||
| } | |||
| rv, err = s.loadSnapshot(snapshot) | |||
| return err | |||
| @@ -536,12 +661,13 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) { | |||
| } | |||
| func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { | |||
| rv := &IndexSnapshot{ | |||
| parent: s, | |||
| internal: make(map[string][]byte), | |||
| refs: 1, | |||
| creator: "loadSnapshot", | |||
| } | |||
| var running uint64 | |||
| c := snapshot.Cursor() | |||
| for k, _ := c.First(); k != nil; k, _ = c.Next() { | |||
| @@ -556,7 +682,7 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { | |||
| _ = rv.DecRef() | |||
| return nil, err | |||
| } | |||
| } else { | |||
| } else if k[0] != boltMetaDataKey[0] { | |||
| segmentBucket := snapshot.Bucket(k) | |||
| if segmentBucket == nil { | |||
| _ = rv.DecRef() | |||
| @@ -577,6 +703,7 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { | |||
| running += segmentSnapshot.segment.Count() | |||
| } | |||
| } | |||
| return rv, nil | |||
| } | |||
| @@ -604,7 +731,9 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro | |||
| _ = segment.Close() | |||
| return nil, fmt.Errorf("error reading deleted bytes: %v", err) | |||
| } | |||
| rv.deleted = deletedBitmap | |||
| if !deletedBitmap.IsEmpty() { | |||
| rv.deleted = deletedBitmap | |||
| } | |||
| } | |||
| return rv, nil | |||
| @@ -643,14 +772,14 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) { | |||
| return 0, err | |||
| } | |||
| if len(persistedEpochs) <= NumSnapshotsToKeep { | |||
| if len(persistedEpochs) <= s.numSnapshotsToKeep { | |||
| // we need to keep everything | |||
| return 0, nil | |||
| } | |||
| // make a map of epochs to protect from deletion | |||
| protectedEpochs := make(map[uint64]struct{}, NumSnapshotsToKeep) | |||
| for _, epoch := range persistedEpochs[0:NumSnapshotsToKeep] { | |||
| protectedEpochs := make(map[uint64]struct{}, s.numSnapshotsToKeep) | |||
| for _, epoch := range persistedEpochs[0:s.numSnapshotsToKeep] { | |||
| protectedEpochs[epoch] = struct{}{} | |||
| } | |||
| @@ -668,7 +797,7 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) { | |||
| s.eligibleForRemoval = newEligible | |||
| s.rootLock.Unlock() | |||
| if len(epochsToRemove) <= 0 { | |||
| if len(epochsToRemove) == 0 { | |||
| return 0, nil | |||
| } | |||
| @@ -1,110 +0,0 @@ | |||
| // Copyright (c) 2017 Couchbase, Inc. | |||
| // | |||
| // Licensed under the Apache License, Version 2.0 (the "License"); | |||
| // you may not use this file except in compliance with the License. | |||
| // You may obtain a copy of the License at | |||
| // | |||
| // http://www.apache.org/licenses/LICENSE-2.0 | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software | |||
| // distributed under the License is distributed on an "AS IS" BASIS, | |||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| // See the License for the specific language governing permissions and | |||
| // limitations under the License. | |||
| package scorch | |||
| import ( | |||
| "github.com/blevesearch/bleve/document" | |||
| "github.com/blevesearch/bleve/index" | |||
| ) | |||
| type Reader struct { | |||
| root *IndexSnapshot // Owns 1 ref-count on the index snapshot. | |||
| } | |||
| func (r *Reader) TermFieldReader(term []byte, field string, includeFreq, | |||
| includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { | |||
| return r.root.TermFieldReader(term, field, includeFreq, includeNorm, includeTermVectors) | |||
| } | |||
| // DocIDReader returns an iterator over all doc ids | |||
| // The caller must close returned instance to release associated resources. | |||
| func (r *Reader) DocIDReaderAll() (index.DocIDReader, error) { | |||
| return r.root.DocIDReaderAll() | |||
| } | |||
| func (r *Reader) DocIDReaderOnly(ids []string) (index.DocIDReader, error) { | |||
| return r.root.DocIDReaderOnly(ids) | |||
| } | |||
| func (r *Reader) FieldDict(field string) (index.FieldDict, error) { | |||
| return r.root.FieldDict(field) | |||
| } | |||
| // FieldDictRange is currently defined to include the start and end terms | |||
| func (r *Reader) FieldDictRange(field string, startTerm []byte, | |||
| endTerm []byte) (index.FieldDict, error) { | |||
| return r.root.FieldDictRange(field, startTerm, endTerm) | |||
| } | |||
| func (r *Reader) FieldDictPrefix(field string, | |||
| termPrefix []byte) (index.FieldDict, error) { | |||
| return r.root.FieldDictPrefix(field, termPrefix) | |||
| } | |||
| func (r *Reader) Document(id string) (*document.Document, error) { | |||
| return r.root.Document(id) | |||
| } | |||
| func (r *Reader) DocumentVisitFieldTerms(id index.IndexInternalID, fields []string, | |||
| visitor index.DocumentFieldTermVisitor) error { | |||
| return r.root.DocumentVisitFieldTerms(id, fields, visitor) | |||
| } | |||
| func (r *Reader) Fields() ([]string, error) { | |||
| return r.root.Fields() | |||
| } | |||
| func (r *Reader) GetInternal(key []byte) ([]byte, error) { | |||
| return r.root.GetInternal(key) | |||
| } | |||
| func (r *Reader) DocCount() (uint64, error) { | |||
| return r.root.DocCount() | |||
| } | |||
| func (r *Reader) ExternalID(id index.IndexInternalID) (string, error) { | |||
| return r.root.ExternalID(id) | |||
| } | |||
| func (r *Reader) InternalID(id string) (index.IndexInternalID, error) { | |||
| return r.root.InternalID(id) | |||
| } | |||
| func (r *Reader) DumpAll() chan interface{} { | |||
| rv := make(chan interface{}) | |||
| go func() { | |||
| close(rv) | |||
| }() | |||
| return rv | |||
| } | |||
| func (r *Reader) DumpDoc(id string) chan interface{} { | |||
| rv := make(chan interface{}) | |||
| go func() { | |||
| close(rv) | |||
| }() | |||
| return rv | |||
| } | |||
| func (r *Reader) DumpFields() chan interface{} { | |||
| rv := make(chan interface{}) | |||
| go func() { | |||
| close(rv) | |||
| }() | |||
| return rv | |||
| } | |||
| func (r *Reader) Close() error { | |||
| return r.root.DecRef() | |||
| } | |||
| @@ -17,6 +17,7 @@ package scorch | |||
| import ( | |||
| "encoding/json" | |||
| "fmt" | |||
| "io/ioutil" | |||
| "os" | |||
| "sync" | |||
| "sync/atomic" | |||
| @@ -27,23 +28,24 @@ import ( | |||
| "github.com/blevesearch/bleve/document" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/index/scorch/segment" | |||
| "github.com/blevesearch/bleve/index/scorch/segment/mem" | |||
| "github.com/blevesearch/bleve/index/scorch/segment/zap" | |||
| "github.com/blevesearch/bleve/index/store" | |||
| "github.com/blevesearch/bleve/registry" | |||
| "github.com/boltdb/bolt" | |||
| bolt "github.com/etcd-io/bbolt" | |||
| ) | |||
| const Name = "scorch" | |||
| const Version uint8 = 1 | |||
| const Version uint8 = 2 | |||
| var ErrClosed = fmt.Errorf("scorch closed") | |||
| type Scorch struct { | |||
| readOnly bool | |||
| version uint8 | |||
| config map[string]interface{} | |||
| analysisQueue *index.AnalysisQueue | |||
| stats *Stats | |||
| stats Stats | |||
| nextSegmentID uint64 | |||
| path string | |||
| @@ -52,12 +54,15 @@ type Scorch struct { | |||
| rootLock sync.RWMutex | |||
| root *IndexSnapshot // holds 1 ref-count on the root | |||
| rootPersisted []chan error // closed when root is persisted | |||
| persistedCallbacks []index.BatchCallback | |||
| nextSnapshotEpoch uint64 | |||
| eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC. | |||
| ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet. | |||
| numSnapshotsToKeep int | |||
| closeCh chan struct{} | |||
| introductions chan *segmentIntroduction | |||
| persists chan *persistIntroduction | |||
| merges chan *segmentMerge | |||
| introducerNotifier chan *epochWatcher | |||
| revertToSnapshots chan *snapshotReversion | |||
| @@ -67,6 +72,23 @@ type Scorch struct { | |||
| onEvent func(event Event) | |||
| onAsyncError func(err error) | |||
| iStats internalStats | |||
| pauseLock sync.RWMutex | |||
| pauseCount uint64 | |||
| } | |||
| type internalStats struct { | |||
| persistEpoch uint64 | |||
| persistSnapshotSize uint64 | |||
| mergeEpoch uint64 | |||
| mergeSnapshotSize uint64 | |||
| newSegBufBytesAdded uint64 | |||
| newSegBufBytesRemoved uint64 | |||
| analysisBytesAdded uint64 | |||
| analysisBytesRemoved uint64 | |||
| } | |||
| func NewScorch(storeName string, | |||
| @@ -80,8 +102,7 @@ func NewScorch(storeName string, | |||
| closeCh: make(chan struct{}), | |||
| ineligibleForRemoval: map[string]bool{}, | |||
| } | |||
| rv.stats = &Stats{i: rv} | |||
| rv.root = &IndexSnapshot{parent: rv, refs: 1} | |||
| rv.root = &IndexSnapshot{parent: rv, refs: 1, creator: "NewScorch"} | |||
| ro, ok := config["read_only"].(bool) | |||
| if ok { | |||
| rv.readOnly = ro | |||
| @@ -101,9 +122,30 @@ func NewScorch(storeName string, | |||
| return rv, nil | |||
| } | |||
| func (s *Scorch) paused() uint64 { | |||
| s.pauseLock.Lock() | |||
| pc := s.pauseCount | |||
| s.pauseLock.Unlock() | |||
| return pc | |||
| } | |||
| func (s *Scorch) incrPause() { | |||
| s.pauseLock.Lock() | |||
| s.pauseCount++ | |||
| s.pauseLock.Unlock() | |||
| } | |||
| func (s *Scorch) decrPause() { | |||
| s.pauseLock.Lock() | |||
| s.pauseCount-- | |||
| s.pauseLock.Unlock() | |||
| } | |||
| func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) { | |||
| if s.onEvent != nil { | |||
| s.incrPause() | |||
| s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur}) | |||
| s.decrPause() | |||
| } | |||
| } | |||
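| // The pause bookkeeping ties application callbacks to the persister's | |||
| // MemoryPressurePauseThreshold check: while an onEvent handler blocks, | |||
| // paused() stays above zero and in-memory merging is skipped. A hedged | |||
| // sketch of such a handler (underMemoryPressure is a hypothetical | |||
| // application helper; the registration mechanism is omitted): | |||
| onEvent := func(e Event) { | |||
| if e.Kind == EventKindBatchIntroductionStart && underMemoryPressure() { | |||
| time.Sleep(100 * time.Millisecond) // keeps the pause count raised | |||
| } | |||
| } | |||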
| @@ -111,6 +153,7 @@ func (s *Scorch) fireAsyncError(err error) { | |||
| if s.onAsyncError != nil { | |||
| s.onAsyncError(err) | |||
| } | |||
| atomic.AddUint64(&s.stats.TotOnErrors, 1) | |||
| } | |||
| func (s *Scorch) Open() error { | |||
| @@ -172,7 +215,10 @@ func (s *Scorch) openBolt() error { | |||
| } | |||
| } | |||
| atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, uint64(len(s.root.segment))) | |||
| s.introductions = make(chan *segmentIntroduction) | |||
| s.persists = make(chan *persistIntroduction) | |||
| s.merges = make(chan *segmentMerge) | |||
| s.introducerNotifier = make(chan *epochWatcher, 1) | |||
| s.revertToSnapshots = make(chan *snapshotReversion) | |||
| @@ -186,6 +232,17 @@ func (s *Scorch) openBolt() error { | |||
| } | |||
| } | |||
| s.numSnapshotsToKeep = NumSnapshotsToKeep | |||
| if v, ok := s.config["numSnapshotsToKeep"]; ok { | |||
| var t int | |||
| if t, err = parseToInteger(v); err != nil { | |||
| return fmt.Errorf("numSnapshotsToKeep parse err: %v", err) | |||
| } | |||
| if t > 0 { | |||
| s.numSnapshotsToKeep = t | |||
| } | |||
| } | |||
| return nil | |||
| } | |||
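| // A sketch of supplying this knob through the same config map (key name | |||
| // as looked up above): | |||
| config := map[string]interface{}{ | |||
| "numSnapshotsToKeep": 3, // retain the 3 most recent persisted snapshots | |||
| } | |||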
| @@ -255,65 +312,83 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { | |||
| // FIXME could sort ids list concurrent with analysis? | |||
| go func() { | |||
| for _, doc := range batch.IndexOps { | |||
| if doc != nil { | |||
| aw := index.NewAnalysisWork(s, doc, resultChan) | |||
| // put the work on the queue | |||
| s.analysisQueue.Queue(aw) | |||
| if len(batch.IndexOps) > 0 { | |||
| go func() { | |||
| for _, doc := range batch.IndexOps { | |||
| if doc != nil { | |||
| aw := index.NewAnalysisWork(s, doc, resultChan) | |||
| // put the work on the queue | |||
| s.analysisQueue.Queue(aw) | |||
| } | |||
| } | |||
| } | |||
| }() | |||
| }() | |||
| } | |||
| // wait for analysis result | |||
| analysisResults := make([]*index.AnalysisResult, int(numUpdates)) | |||
| var itemsDeQueued uint64 | |||
| var totalAnalysisSize int | |||
| for itemsDeQueued < numUpdates { | |||
| result := <-resultChan | |||
| resultSize := result.Size() | |||
| atomic.AddUint64(&s.iStats.analysisBytesAdded, uint64(resultSize)) | |||
| totalAnalysisSize += resultSize | |||
| analysisResults[itemsDeQueued] = result | |||
| itemsDeQueued++ | |||
| } | |||
| close(resultChan) | |||
| defer atomic.AddUint64(&s.iStats.analysisBytesRemoved, uint64(totalAnalysisSize)) | |||
| atomic.AddUint64(&s.stats.TotAnalysisTime, uint64(time.Since(start))) | |||
| atomic.AddUint64(&s.stats.analysisTime, uint64(time.Since(start))) | |||
| indexStart := time.Now() | |||
| // notify handlers that we're about to introduce a segment | |||
| s.fireEvent(EventKindBatchIntroductionStart, 0) | |||
| var newSegment segment.Segment | |||
| var bufBytes uint64 | |||
| if len(analysisResults) > 0 { | |||
| newSegment, err = zap.NewSegmentBase(mem.NewFromAnalyzedDocs(analysisResults), DefaultChunkFactor) | |||
| newSegment, bufBytes, err = zap.AnalysisResultsToSegmentBase(analysisResults, DefaultChunkFactor) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| atomic.AddUint64(&s.iStats.newSegBufBytesAdded, bufBytes) | |||
| } else { | |||
| atomic.AddUint64(&s.stats.TotBatchesEmpty, 1) | |||
| } | |||
| err = s.prepareSegment(newSegment, ids, batch.InternalOps) | |||
| err = s.prepareSegment(newSegment, ids, batch.InternalOps, batch.PersistedCallback()) | |||
| if err != nil { | |||
| if newSegment != nil { | |||
| _ = newSegment.Close() | |||
| } | |||
| atomic.AddUint64(&s.stats.errors, 1) | |||
| atomic.AddUint64(&s.stats.TotOnErrors, 1) | |||
| } else { | |||
| atomic.AddUint64(&s.stats.updates, numUpdates) | |||
| atomic.AddUint64(&s.stats.deletes, numDeletes) | |||
| atomic.AddUint64(&s.stats.batches, 1) | |||
| atomic.AddUint64(&s.stats.numPlainTextBytesIndexed, numPlainTextBytes) | |||
| atomic.AddUint64(&s.stats.TotUpdates, numUpdates) | |||
| atomic.AddUint64(&s.stats.TotDeletes, numDeletes) | |||
| atomic.AddUint64(&s.stats.TotBatches, 1) | |||
| atomic.AddUint64(&s.stats.TotIndexedPlainTextBytes, numPlainTextBytes) | |||
| } | |||
| atomic.AddUint64(&s.iStats.newSegBufBytesRemoved, bufBytes) | |||
| atomic.AddUint64(&s.stats.TotIndexTime, uint64(time.Since(indexStart))) | |||
| return err | |||
| } | |||
| func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, | |||
| internalOps map[string][]byte) error { | |||
| internalOps map[string][]byte, persistedCallback index.BatchCallback) error { | |||
| // new introduction | |||
| introduction := &segmentIntroduction{ | |||
| id: atomic.AddUint64(&s.nextSegmentID, 1), | |||
| data: newSegment, | |||
| ids: ids, | |||
| obsoletes: make(map[uint64]*roaring.Bitmap), | |||
| internal: internalOps, | |||
| applied: make(chan error), | |||
| id: atomic.AddUint64(&s.nextSegmentID, 1), | |||
| data: newSegment, | |||
| ids: ids, | |||
| obsoletes: make(map[uint64]*roaring.Bitmap), | |||
| internal: internalOps, | |||
| applied: make(chan error), | |||
| persistedCallback: persistedCallback, | |||
| } | |||
| if !s.unsafeBatch { | |||
| @@ -326,6 +401,8 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, | |||
| root.AddRef() | |||
| s.rootLock.RUnlock() | |||
| defer func() { _ = root.DecRef() }() | |||
| for _, seg := range root.segment { | |||
| delta, err := seg.segment.DocNumbers(ids) | |||
| if err != nil { | |||
| @@ -334,7 +411,7 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, | |||
| introduction.obsoletes[seg.id] = delta | |||
| } | |||
| _ = root.DecRef() | |||
| introStartTime := time.Now() | |||
| s.introductions <- introduction | |||
| @@ -348,6 +425,12 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, | |||
| err = <-introduction.persisted | |||
| } | |||
| introTime := uint64(time.Since(introStartTime)) | |||
| atomic.AddUint64(&s.stats.TotBatchIntroTime, introTime) | |||
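| // note: this load/compare/store sequence is not atomic as a whole, so two | |||
| // concurrent batches may race and briefly under-record the maximum; that | |||
| // is acceptable for a best-effort stat | |||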
| if atomic.LoadUint64(&s.stats.MaxBatchIntroTime) < introTime { | |||
| atomic.StoreUint64(&s.stats.MaxBatchIntroTime, introTime) | |||
| } | |||
| return err | |||
| } | |||
| @@ -366,18 +449,69 @@ func (s *Scorch) DeleteInternal(key []byte) error { | |||
| // Reader returns a low-level accessor on the index data. Close it to | |||
| // release associated resources. | |||
| func (s *Scorch) Reader() (index.IndexReader, error) { | |||
| return s.currentSnapshot(), nil | |||
| } | |||
| func (s *Scorch) currentSnapshot() *IndexSnapshot { | |||
| s.rootLock.RLock() | |||
| rv := &Reader{root: s.root} | |||
| rv.root.AddRef() | |||
| rv := s.root | |||
| if rv != nil { | |||
| rv.AddRef() | |||
| } | |||
| s.rootLock.RUnlock() | |||
| return rv, nil | |||
| return rv | |||
| } | |||
| func (s *Scorch) Stats() json.Marshaler { | |||
| return s.stats | |||
| return &s.stats | |||
| } | |||
| func (s *Scorch) diskFileStats() (uint64, uint64) { | |||
| var numFilesOnDisk, numBytesUsedDisk uint64 | |||
| if s.path != "" { | |||
| finfos, err := ioutil.ReadDir(s.path) | |||
| if err == nil { | |||
| for _, finfo := range finfos { | |||
| if !finfo.IsDir() { | |||
| numBytesUsedDisk += uint64(finfo.Size()) | |||
| numFilesOnDisk++ | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return numFilesOnDisk, numBytesUsedDisk | |||
| } | |||
| func (s *Scorch) StatsMap() map[string]interface{} { | |||
| m, _ := s.stats.statsMap() | |||
| m := s.stats.ToMap() | |||
| numFilesOnDisk, numBytesUsedDisk := s.diskFileStats() | |||
| m["CurOnDiskBytes"] = numBytesUsedDisk | |||
| m["CurOnDiskFiles"] = numFilesOnDisk | |||
| // TODO: consider one day removing these backwards compatible | |||
| // names for apps using the old names | |||
| m["updates"] = m["TotUpdates"] | |||
| m["deletes"] = m["TotDeletes"] | |||
| m["batches"] = m["TotBatches"] | |||
| m["errors"] = m["TotOnErrors"] | |||
| m["analysis_time"] = m["TotAnalysisTime"] | |||
| m["index_time"] = m["TotIndexTime"] | |||
| m["term_searchers_started"] = m["TotTermSearchersStarted"] | |||
| m["term_searchers_finished"] = m["TotTermSearchersFinished"] | |||
| m["num_plain_text_bytes_indexed"] = m["TotIndexedPlainTextBytes"] | |||
| m["num_items_introduced"] = m["TotIntroducedItems"] | |||
| m["num_items_persisted"] = m["TotPersistedItems"] | |||
| m["num_recs_to_persist"] = m["TotItemsToPersist"] | |||
| m["num_bytes_used_disk"] = m["CurOnDiskBytes"] | |||
| m["num_files_on_disk"] = m["CurOnDiskFiles"] | |||
| m["num_root_memorysegments"] = m["TotMemorySegmentsAtRoot"] | |||
| m["num_root_filesegments"] = m["TotFileSegmentsAtRoot"] | |||
| m["num_persister_nap_pause_completed"] = m["TotPersisterNapPauseCompleted"] | |||
| m["num_persister_nap_merger_break"] = m["TotPersisterMergerNapBreak"] | |||
| m["total_compaction_written_bytes"] = m["TotFileMergeWrittenBytes"] | |||
| return m | |||
| } | |||
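| // For example, given a *Scorch s, a legacy name and its new counterpart | |||
| // resolve to the same value: | |||
| // | |||
| //	m := s.StatsMap() | |||
| //	fmt.Println(m["num_bytes_used_disk"] == m["CurOnDiskBytes"]) // true | |||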
| @@ -394,7 +528,7 @@ func (s *Scorch) Analyze(d *document.Document) *index.AnalysisResult { | |||
| rv.Analyzed[i] = tokenFreqs | |||
| rv.Length[i] = fieldLength | |||
| if len(d.CompositeFields) > 0 { | |||
| if len(d.CompositeFields) > 0 && field.Name() != "_id" { | |||
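| // (the _id field is skipped so its value is not folded into | |||
| // composite fields such as _all) | |||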
| // see if any of the composite fields need this | |||
| for _, compositeField := range d.CompositeFields { | |||
| compositeField.Compose(field.Name(), fieldLength, tokenFreqs) | |||
| @@ -418,20 +552,43 @@ func (s *Scorch) AddEligibleForRemoval(epoch uint64) { | |||
| s.rootLock.Unlock() | |||
| } | |||
| func (s *Scorch) MemoryUsed() uint64 { | |||
| var memUsed uint64 | |||
| s.rootLock.RLock() | |||
| if s.root != nil { | |||
| for _, segmentSnapshot := range s.root.segment { | |||
| memUsed += 8 /* size of id -> uint64 */ + | |||
| segmentSnapshot.segment.SizeInBytes() | |||
| if segmentSnapshot.deleted != nil { | |||
| memUsed += segmentSnapshot.deleted.GetSizeInBytes() | |||
| } | |||
| memUsed += segmentSnapshot.cachedDocs.sizeInBytes() | |||
| } | |||
| func (s *Scorch) MemoryUsed() (memUsed uint64) { | |||
| indexSnapshot := s.currentSnapshot() | |||
| if indexSnapshot == nil { | |||
| return | |||
| } | |||
| s.rootLock.RUnlock() | |||
| defer func() { | |||
| _ = indexSnapshot.Close() | |||
| }() | |||
| // Account for current root snapshot overhead | |||
| memUsed += uint64(indexSnapshot.Size()) | |||
| // Account for snapshot that the persister may be working on | |||
| persistEpoch := atomic.LoadUint64(&s.iStats.persistEpoch) | |||
| persistSnapshotSize := atomic.LoadUint64(&s.iStats.persistSnapshotSize) | |||
| if persistEpoch != 0 && indexSnapshot.epoch > persistEpoch { | |||
| // the snapshot that the persister is working on isn't the same as | |||
| // the current snapshot | |||
| memUsed += persistSnapshotSize | |||
| } | |||
| // Account for snapshot that the merger may be working on | |||
| mergeEpoch := atomic.LoadUint64(&s.iStats.mergeEpoch) | |||
| mergeSnapshotSize := atomic.LoadUint64(&s.iStats.mergeSnapshotSize) | |||
| if mergeEpoch != 0 && indexSnapshot.epoch > mergeEpoch { | |||
| // the snapshot that the merger is working on isn't the same as | |||
| // the current snapshot | |||
| memUsed += mergeSnapshotSize | |||
| } | |||
| memUsed += (atomic.LoadUint64(&s.iStats.newSegBufBytesAdded) - | |||
| atomic.LoadUint64(&s.iStats.newSegBufBytesRemoved)) | |||
| memUsed += (atomic.LoadUint64(&s.iStats.analysisBytesAdded) - | |||
| atomic.LoadUint64(&s.iStats.analysisBytesRemoved)) | |||
| return memUsed | |||
| } | |||
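| // The added/removed pairs above form a lock-free gauge: each side is a | |||
| // monotonically increasing atomic counter, and their difference is the | |||
| // bytes currently in flight. A standalone sketch of the pattern: | |||
| // | |||
| //	var added, removed uint64 | |||
| //	atomic.AddUint64(&added, n)   // when buffering n bytes | |||
| //	atomic.AddUint64(&removed, n) // when releasing them | |||
| //	inFlight := atomic.LoadUint64(&added) - atomic.LoadUint64(&removed) | |||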
| @@ -450,3 +607,15 @@ func (s *Scorch) unmarkIneligibleForRemoval(filename string) { | |||
| func init() { | |||
| registry.RegisterIndexType(Name, NewScorch) | |||
| } | |||
| func parseToInteger(i interface{}) (int, error) { | |||
| switch v := i.(type) { | |||
| case float64: | |||
| return int(v), nil | |||
| case int: | |||
| return v, nil | |||
| default: | |||
| return 0, fmt.Errorf("expects int or float64 value") | |||
| } | |||
| } | |||
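| // parseToInteger accepts float64 because encoding/json decodes every JSON | |||
| // number into float64 when unmarshaling into interface{}. For example: | |||
| // | |||
| //	var cfg map[string]interface{} | |||
| //	_ = json.Unmarshal([]byte(`{"numSnapshotsToKeep": 4}`), &cfg) | |||
| //	n, _ := parseToInteger(cfg["numSnapshotsToKeep"]) // n == 4 | |||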
| @@ -17,6 +17,7 @@ package segment | |||
| import ( | |||
| "github.com/RoaringBitmap/roaring" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/couchbase/vellum" | |||
| ) | |||
| type EmptySegment struct{} | |||
| @@ -29,6 +30,10 @@ func (e *EmptySegment) VisitDocument(num uint64, visitor DocumentFieldValueVisit | |||
| return nil | |||
| } | |||
| func (e *EmptySegment) DocID(num uint64) ([]byte, error) { | |||
| return nil, nil | |||
| } | |||
| func (e *EmptySegment) Count() uint64 { | |||
| return 0 | |||
| } | |||
| @@ -46,6 +51,10 @@ func (e *EmptySegment) Close() error { | |||
| return nil | |||
| } | |||
| func (e *EmptySegment) Size() uint64 { | |||
| return 0 | |||
| } | |||
| func (e *EmptySegment) AddRef() { | |||
| } | |||
| @@ -55,8 +64,8 @@ func (e *EmptySegment) DecRef() error { | |||
| type EmptyDictionary struct{} | |||
| func (e *EmptyDictionary) PostingsList(term string, | |||
| except *roaring.Bitmap) (PostingsList, error) { | |||
| func (e *EmptyDictionary) PostingsList(term []byte, | |||
| except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error) { | |||
| return &EmptyPostingsList{}, nil | |||
| } | |||
| @@ -72,18 +81,37 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator { | |||
| return &EmptyDictionaryIterator{} | |||
| } | |||
| func (e *EmptyDictionary) AutomatonIterator(a vellum.Automaton, | |||
| startKeyInclusive, endKeyExclusive []byte) DictionaryIterator { | |||
| return &EmptyDictionaryIterator{} | |||
| } | |||
| func (e *EmptyDictionary) OnlyIterator(onlyTerms [][]byte, | |||
| includeCount bool) DictionaryIterator { | |||
| return &EmptyDictionaryIterator{} | |||
| } | |||
| type EmptyDictionaryIterator struct{} | |||
| func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) { | |||
| return nil, nil | |||
| } | |||
| func (e *EmptyPostingsIterator) Advance(uint64) (Posting, error) { | |||
| return nil, nil | |||
| } | |||
| type EmptyPostingsList struct{} | |||
| func (e *EmptyPostingsList) Iterator() PostingsIterator { | |||
| func (e *EmptyPostingsList) Iterator(includeFreq, includeNorm, includeLocations bool, | |||
| prealloc PostingsIterator) PostingsIterator { | |||
| return &EmptyPostingsIterator{} | |||
| } | |||
| func (e *EmptyPostingsList) Size() int { | |||
| return 0 | |||
| } | |||
| func (e *EmptyPostingsList) Count() uint64 { | |||
| return 0 | |||
| } | |||
| @@ -93,3 +121,9 @@ type EmptyPostingsIterator struct{} | |||
| func (e *EmptyPostingsIterator) Next() (Posting, error) { | |||
| return nil, nil | |||
| } | |||
| func (e *EmptyPostingsIterator) Size() int { | |||
| return 0 | |||
| } | |||
| var AnEmptyPostingsIterator = &EmptyPostingsIterator{} | |||
| @@ -1,321 +0,0 @@ | |||
| // Copyright (c) 2017 Couchbase, Inc. | |||
| // | |||
| // Licensed under the Apache License, Version 2.0 (the "License"); | |||
| // you may not use this file except in compliance with the License. | |||
| // You may obtain a copy of the License at | |||
| // | |||
| // http://www.apache.org/licenses/LICENSE-2.0 | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software | |||
| // distributed under the License is distributed on an "AS IS" BASIS, | |||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| // See the License for the specific language governing permissions and | |||
| // limitations under the License. | |||
| package mem | |||
| import ( | |||
| "math" | |||
| "sort" | |||
| "github.com/RoaringBitmap/roaring" | |||
| "github.com/blevesearch/bleve/analysis" | |||
| "github.com/blevesearch/bleve/document" | |||
| "github.com/blevesearch/bleve/index" | |||
| ) | |||
| // NewFromAnalyzedDocs places the analyzed document mutations into a new segment | |||
| func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { | |||
| s := New() | |||
| // ensure that the _id field gets fieldID 0 | |||
| s.getOrDefineField("_id") | |||
| // fill Dicts/DictKeys and preallocate memory | |||
| s.initializeDict(results) | |||
| // walk each doc | |||
| for _, result := range results { | |||
| s.processDocument(result) | |||
| } | |||
| // go back and sort the dictKeys | |||
| for _, dict := range s.DictKeys { | |||
| sort.Strings(dict) | |||
| } | |||
| // compute memory usage of segment | |||
| s.updateSizeInBytes() | |||
| // professional debugging | |||
| // | |||
| // log.Printf("fields: %v\n", s.FieldsMap) | |||
| // log.Printf("fieldsInv: %v\n", s.FieldsInv) | |||
| // log.Printf("fieldsLoc: %v\n", s.FieldsLoc) | |||
| // log.Printf("dicts: %v\n", s.Dicts) | |||
| // log.Printf("dict keys: %v\n", s.DictKeys) | |||
| // for i, posting := range s.Postings { | |||
| // log.Printf("posting %d: %v\n", i, posting) | |||
| // } | |||
| // for i, freq := range s.Freqs { | |||
| // log.Printf("freq %d: %v\n", i, freq) | |||
| // } | |||
| // for i, norm := range s.Norms { | |||
| // log.Printf("norm %d: %v\n", i, norm) | |||
| // } | |||
| // for i, field := range s.Locfields { | |||
| // log.Printf("field %d: %v\n", i, field) | |||
| // } | |||
| // for i, start := range s.Locstarts { | |||
| // log.Printf("start %d: %v\n", i, start) | |||
| // } | |||
| // for i, end := range s.Locends { | |||
| // log.Printf("end %d: %v\n", i, end) | |||
| // } | |||
| // for i, pos := range s.Locpos { | |||
| // log.Printf("pos %d: %v\n", i, pos) | |||
| // } | |||
| // for i, apos := range s.Locarraypos { | |||
| // log.Printf("apos %d: %v\n", i, apos) | |||
| // } | |||
| // log.Printf("stored: %v\n", s.Stored) | |||
| // log.Printf("stored types: %v\n", s.StoredTypes) | |||
| // log.Printf("stored pos: %v\n", s.StoredPos) | |||
| return s | |||
| } | |||
| // fill Dicts/DictKeys and preallocate memory for postings | |||
| func (s *Segment) initializeDict(results []*index.AnalysisResult) { | |||
| var numPostingsLists int | |||
| numTermsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id. | |||
| numLocsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id. | |||
| var numTokenFrequencies int | |||
| var totLocs int | |||
| // initial scan to collect all fieldIDs so they can be sorted | |||
| for _, result := range results { | |||
| for _, field := range result.Document.CompositeFields { | |||
| s.getOrDefineField(field.Name()) | |||
| } | |||
| for _, field := range result.Document.Fields { | |||
| s.getOrDefineField(field.Name()) | |||
| } | |||
| } | |||
| sort.Strings(s.FieldsInv[1:]) // keep _id as first field | |||
| s.FieldsMap = make(map[string]uint16, len(s.FieldsInv)) | |||
| for fieldID, fieldName := range s.FieldsInv { | |||
| s.FieldsMap[fieldName] = uint16(fieldID + 1) | |||
| } | |||
| processField := func(fieldID uint16, tfs analysis.TokenFrequencies) { | |||
| for term, tf := range tfs { | |||
| pidPlus1, exists := s.Dicts[fieldID][term] | |||
| if !exists { | |||
| numPostingsLists++ | |||
| pidPlus1 = uint64(numPostingsLists) | |||
| s.Dicts[fieldID][term] = pidPlus1 | |||
| s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term) | |||
| numTermsPerPostingsList = append(numTermsPerPostingsList, 0) | |||
| numLocsPerPostingsList = append(numLocsPerPostingsList, 0) | |||
| } | |||
| pid := pidPlus1 - 1 | |||
| numTermsPerPostingsList[pid] += 1 | |||
| numLocsPerPostingsList[pid] += len(tf.Locations) | |||
| totLocs += len(tf.Locations) | |||
| } | |||
| numTokenFrequencies += len(tfs) | |||
| } | |||
| for _, result := range results { | |||
| // walk each composite field | |||
| for _, field := range result.Document.CompositeFields { | |||
| fieldID := uint16(s.getOrDefineField(field.Name())) | |||
| _, tf := field.Analyze() | |||
| processField(fieldID, tf) | |||
| } | |||
| // walk each field | |||
| for i, field := range result.Document.Fields { | |||
| fieldID := uint16(s.getOrDefineField(field.Name())) | |||
| tf := result.Analyzed[i] | |||
| processField(fieldID, tf) | |||
| } | |||
| } | |||
| s.Postings = make([]*roaring.Bitmap, numPostingsLists) | |||
| for i := 0; i < numPostingsLists; i++ { | |||
| s.Postings[i] = roaring.New() | |||
| } | |||
| s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists) | |||
| for i := 0; i < numPostingsLists; i++ { | |||
| s.PostingsLocs[i] = roaring.New() | |||
| } | |||
| // Preallocate big, contiguous backing arrays. | |||
| auint64Backing := make([][]uint64, numPostingsLists*4+totLocs) // For Freqs, Locstarts, Locends, Locpos, sub-Locarraypos. | |||
| uint64Backing := make([]uint64, numTokenFrequencies+totLocs*3) // For sub-Freqs, sub-Locstarts, sub-Locends, sub-Locpos. | |||
| float32Backing := make([]float32, numTokenFrequencies) // For sub-Norms. | |||
| uint16Backing := make([]uint16, totLocs) // For sub-Locfields. | |||
| // Point top-level slices to the backing arrays. | |||
| s.Freqs = auint64Backing[0:numPostingsLists] | |||
| auint64Backing = auint64Backing[numPostingsLists:] | |||
| s.Norms = make([][]float32, numPostingsLists) | |||
| s.Locfields = make([][]uint16, numPostingsLists) | |||
| s.Locstarts = auint64Backing[0:numPostingsLists] | |||
| auint64Backing = auint64Backing[numPostingsLists:] | |||
| s.Locends = auint64Backing[0:numPostingsLists] | |||
| auint64Backing = auint64Backing[numPostingsLists:] | |||
| s.Locpos = auint64Backing[0:numPostingsLists] | |||
| auint64Backing = auint64Backing[numPostingsLists:] | |||
| s.Locarraypos = make([][][]uint64, numPostingsLists) | |||
| // Point sub-slices to the backing arrays. | |||
| for pid, numTerms := range numTermsPerPostingsList { | |||
| s.Freqs[pid] = uint64Backing[0:0] | |||
| uint64Backing = uint64Backing[numTerms:] | |||
| s.Norms[pid] = float32Backing[0:0] | |||
| float32Backing = float32Backing[numTerms:] | |||
| } | |||
| for pid, numLocs := range numLocsPerPostingsList { | |||
| s.Locfields[pid] = uint16Backing[0:0] | |||
| uint16Backing = uint16Backing[numLocs:] | |||
| s.Locstarts[pid] = uint64Backing[0:0] | |||
| uint64Backing = uint64Backing[numLocs:] | |||
| s.Locends[pid] = uint64Backing[0:0] | |||
| uint64Backing = uint64Backing[numLocs:] | |||
| s.Locpos[pid] = uint64Backing[0:0] | |||
| uint64Backing = uint64Backing[numLocs:] | |||
| s.Locarraypos[pid] = auint64Backing[0:0] | |||
| auint64Backing = auint64Backing[numLocs:] | |||
| } | |||
| } | |||
| func (s *Segment) processDocument(result *index.AnalysisResult) { | |||
| // used to collate information across fields | |||
| docMap := make(map[uint16]analysis.TokenFrequencies, len(s.FieldsMap)) | |||
| fieldLens := make(map[uint16]int, len(s.FieldsMap)) | |||
| docNum := uint64(s.addDocument()) | |||
| processField := func(field uint16, name string, l int, tf analysis.TokenFrequencies) { | |||
| fieldLens[field] += l | |||
| if existingFreqs, ok := docMap[field]; ok { | |||
| existingFreqs.MergeAll(name, tf) | |||
| } else { | |||
| docMap[field] = tf | |||
| } | |||
| } | |||
| storeField := func(docNum uint64, field uint16, typ byte, val []byte, pos []uint64) { | |||
| s.Stored[docNum][field] = append(s.Stored[docNum][field], val) | |||
| s.StoredTypes[docNum][field] = append(s.StoredTypes[docNum][field], typ) | |||
| s.StoredPos[docNum][field] = append(s.StoredPos[docNum][field], pos) | |||
| } | |||
| // walk each composite field | |||
| for _, field := range result.Document.CompositeFields { | |||
| fieldID := uint16(s.getOrDefineField(field.Name())) | |||
| l, tf := field.Analyze() | |||
| processField(fieldID, field.Name(), l, tf) | |||
| } | |||
| // walk each field | |||
| for i, field := range result.Document.Fields { | |||
| fieldID := uint16(s.getOrDefineField(field.Name())) | |||
| l := result.Length[i] | |||
| tf := result.Analyzed[i] | |||
| processField(fieldID, field.Name(), l, tf) | |||
| if field.Options().IsStored() { | |||
| storeField(docNum, fieldID, encodeFieldType(field), field.Value(), field.ArrayPositions()) | |||
| } | |||
| if field.Options().IncludeDocValues() { | |||
| s.DocValueFields[fieldID] = true | |||
| } | |||
| } | |||
| // now that it's been rolled up into docMap, walk that | |||
| for fieldID, tokenFrequencies := range docMap { | |||
| for term, tokenFreq := range tokenFrequencies { | |||
| pid := s.Dicts[fieldID][term] - 1 | |||
| bs := s.Postings[pid] | |||
| bs.AddInt(int(docNum)) | |||
| s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency())) | |||
| s.Norms[pid] = append(s.Norms[pid], float32(1.0/math.Sqrt(float64(fieldLens[fieldID])))) | |||
| locationBS := s.PostingsLocs[pid] | |||
| if len(tokenFreq.Locations) > 0 { | |||
| locationBS.AddInt(int(docNum)) | |||
| for _, loc := range tokenFreq.Locations { | |||
| var locf = fieldID | |||
| if loc.Field != "" { | |||
| locf = uint16(s.getOrDefineField(loc.Field)) | |||
| } | |||
| s.Locfields[pid] = append(s.Locfields[pid], locf) | |||
| s.Locstarts[pid] = append(s.Locstarts[pid], uint64(loc.Start)) | |||
| s.Locends[pid] = append(s.Locends[pid], uint64(loc.End)) | |||
| s.Locpos[pid] = append(s.Locpos[pid], uint64(loc.Position)) | |||
| if len(loc.ArrayPositions) > 0 { | |||
| s.Locarraypos[pid] = append(s.Locarraypos[pid], loc.ArrayPositions) | |||
| } else { | |||
| s.Locarraypos[pid] = append(s.Locarraypos[pid], nil) | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| func (s *Segment) getOrDefineField(name string) int { | |||
| fieldIDPlus1, ok := s.FieldsMap[name] | |||
| if !ok { | |||
| fieldIDPlus1 = uint16(len(s.FieldsInv) + 1) | |||
| s.FieldsMap[name] = fieldIDPlus1 | |||
| s.FieldsInv = append(s.FieldsInv, name) | |||
| s.Dicts = append(s.Dicts, make(map[string]uint64)) | |||
| s.DictKeys = append(s.DictKeys, make([]string, 0)) | |||
| } | |||
| return int(fieldIDPlus1 - 1) | |||
| } | |||
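| // Storing fieldID+1 exploits Go's zero value: looking up a missing key in | |||
| // FieldsMap yields 0, so 0 unambiguously means "not defined" while real | |||
| // field IDs still start at 0. For example: | |||
| // | |||
| //	id, ok := s.FieldsMap["missing"] // id == 0, ok == false | |||
| //	// a stored value of 3 denotes fieldID 2 | |||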
| func (s *Segment) addDocument() int { | |||
| docNum := len(s.Stored) | |||
| s.Stored = append(s.Stored, map[uint16][][]byte{}) | |||
| s.StoredTypes = append(s.StoredTypes, map[uint16][]byte{}) | |||
| s.StoredPos = append(s.StoredPos, map[uint16][][]uint64{}) | |||
| return docNum | |||
| } | |||
| func encodeFieldType(f document.Field) byte { | |||
| fieldType := byte('x') | |||
| switch f.(type) { | |||
| case *document.TextField: | |||
| fieldType = 't' | |||
| case *document.NumericField: | |||
| fieldType = 'n' | |||
| case *document.DateTimeField: | |||
| fieldType = 'd' | |||
| case *document.BooleanField: | |||
| fieldType = 'b' | |||
| case *document.GeoPointField: | |||
| fieldType = 'g' | |||
| case *document.CompositeField: | |||
| fieldType = 'c' | |||
| } | |||
| return fieldType | |||
| } | |||
| @@ -1,103 +0,0 @@ | |||
| // Copyright (c) 2017 Couchbase, Inc. | |||
| // | |||
| // Licensed under the Apache License, Version 2.0 (the "License"); | |||
| // you may not use this file except in compliance with the License. | |||
| // You may obtain a copy of the License at | |||
| // | |||
| // http://www.apache.org/licenses/LICENSE-2.0 | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software | |||
| // distributed under the License is distributed on an "AS IS" BASIS, | |||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| // See the License for the specific language governing permissions and | |||
| // limitations under the License. | |||
| package mem | |||
| import ( | |||
| "sort" | |||
| "strings" | |||
| "github.com/RoaringBitmap/roaring" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/index/scorch/segment" | |||
| ) | |||
| // Dictionary is the in-memory representation of the term dictionary | |||
| type Dictionary struct { | |||
| segment *Segment | |||
| field string | |||
| fieldID uint16 | |||
| } | |||
| // PostingsList returns the postings list for the specified term | |||
| func (d *Dictionary) PostingsList(term string, | |||
| except *roaring.Bitmap) (segment.PostingsList, error) { | |||
| return &PostingsList{ | |||
| dictionary: d, | |||
| term: term, | |||
| postingsID: d.segment.Dicts[d.fieldID][term], | |||
| except: except, | |||
| }, nil | |||
| } | |||
| // Iterator returns an iterator for this dictionary | |||
| func (d *Dictionary) Iterator() segment.DictionaryIterator { | |||
| return &DictionaryIterator{ | |||
| d: d, | |||
| } | |||
| } | |||
| // PrefixIterator returns an iterator which only visits terms having | |||
| // the specified prefix | |||
| func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { | |||
| offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], prefix) | |||
| return &DictionaryIterator{ | |||
| d: d, | |||
| prefix: prefix, | |||
| offset: offset, | |||
| } | |||
| } | |||
| // RangeIterator returns an iterator which only visits terms between the | |||
| // start and end terms. NOTE: bleve.index API specifies the end is inclusive. | |||
| func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator { | |||
| offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], start) | |||
| return &DictionaryIterator{ | |||
| d: d, | |||
| offset: offset, | |||
| end: end, | |||
| } | |||
| } | |||
| // DictionaryIterator is an iterator over the term dictionary | |||
| type DictionaryIterator struct { | |||
| d *Dictionary | |||
| prefix string | |||
| end string | |||
| offset int | |||
| dictEntry index.DictEntry // reused across Next()'s | |||
| } | |||
| // Next returns the next entry in the dictionary | |||
| func (d *DictionaryIterator) Next() (*index.DictEntry, error) { | |||
| if d.offset > len(d.d.segment.DictKeys[d.d.fieldID])-1 { | |||
| return nil, nil | |||
| } | |||
| next := d.d.segment.DictKeys[d.d.fieldID][d.offset] | |||
| // check prefix | |||
| if d.prefix != "" && !strings.HasPrefix(next, d.prefix) { | |||
| return nil, nil | |||
| } | |||
| // check end (bleve.index API demands inclusive end) | |||
| if d.end != "" && next > d.end { | |||
| return nil, nil | |||
| } | |||
| d.offset++ | |||
| postingID := d.d.segment.Dicts[d.d.fieldID][next] | |||
| d.dictEntry.Term = next | |||
| d.dictEntry.Count = d.d.segment.Postings[postingID-1].GetCardinality() | |||
| return &d.dictEntry, nil | |||
| } | |||
| @@ -1,178 +0,0 @@ | |||
| // Copyright (c) 2017 Couchbase, Inc. | |||
| // | |||
| // Licensed under the Apache License, Version 2.0 (the "License"); | |||
| // you may not use this file except in compliance with the License. | |||
| // You may obtain a copy of the License at | |||
| // | |||
| // http://www.apache.org/licenses/LICENSE-2.0 | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software | |||
| // distributed under the License is distributed on an "AS IS" BASIS, | |||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| // See the License for the specific language governing permissions and | |||
| // limitations under the License. | |||
| package mem | |||
| import ( | |||
| "github.com/RoaringBitmap/roaring" | |||
| "github.com/blevesearch/bleve/index/scorch/segment" | |||
| ) | |||
| // PostingsList is an in-memory representation of a postings list | |||
| type PostingsList struct { | |||
| dictionary *Dictionary | |||
| term string | |||
| postingsID uint64 | |||
| except *roaring.Bitmap | |||
| } | |||
| // Count returns the number of items on this postings list | |||
| func (p *PostingsList) Count() uint64 { | |||
| var rv uint64 | |||
| if p.postingsID > 0 { | |||
| rv = p.dictionary.segment.Postings[p.postingsID-1].GetCardinality() | |||
| if p.except != nil { | |||
| except := p.except.GetCardinality() | |||
| if except > rv { | |||
| // avoid underflow | |||
| except = rv | |||
| } | |||
| rv -= except | |||
| } | |||
| } | |||
| return rv | |||
| } | |||
| // Iterator returns an iterator for this postings list | |||
| func (p *PostingsList) Iterator() segment.PostingsIterator { | |||
| rv := &PostingsIterator{ | |||
| postings: p, | |||
| } | |||
| if p.postingsID > 0 { | |||
| allbits := p.dictionary.segment.Postings[p.postingsID-1] | |||
| rv.locations = p.dictionary.segment.PostingsLocs[p.postingsID-1] | |||
| rv.all = allbits.Iterator() | |||
| if p.except != nil { | |||
| allExcept := allbits.Clone() | |||
| allExcept.AndNot(p.except) | |||
| rv.actual = allExcept.Iterator() | |||
| } else { | |||
| rv.actual = allbits.Iterator() | |||
| } | |||
| } | |||
| return rv | |||
| } | |||
| // PostingsIterator provides a way to iterate through the postings list | |||
| type PostingsIterator struct { | |||
| postings *PostingsList | |||
| all roaring.IntIterable | |||
| locations *roaring.Bitmap | |||
| offset int | |||
| locoffset int | |||
| actual roaring.IntIterable | |||
| } | |||
| // Next returns the next posting on the postings list, or nil at the end | |||
| func (i *PostingsIterator) Next() (segment.Posting, error) { | |||
| if i.actual == nil || !i.actual.HasNext() { | |||
| return nil, nil | |||
| } | |||
| n := i.actual.Next() | |||
| allN := i.all.Next() | |||
| // n is the next actual hit (excluding some postings) | |||
| // allN is the next hit in the full postings | |||
| // if they don't match, adjust offsets to factor in item we're skipping over | |||
| // incr the all iterator, and check again | |||
| for allN != n { | |||
| i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset]) | |||
| i.offset++ | |||
| allN = i.all.Next() | |||
| } | |||
| rv := &Posting{ | |||
| iterator: i, | |||
| docNum: uint64(n), | |||
| offset: i.offset, | |||
| locoffset: i.locoffset, | |||
| hasLoc: i.locations.Contains(n), | |||
| } | |||
| i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset]) | |||
| i.offset++ | |||
| return rv, nil | |||
| } | |||
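| // Worked example of the skip logic above: with postings {1, 2, 3}, except | |||
| // {2}, and per-hit freqs [1, 4, 2], the actual iterator yields docs 1 and | |||
| // 3. When n == 3 but allN == 2, the loop adds Freqs[...][1] == 4 (the | |||
| // frequency of the skipped doc 2) to locoffset before returning, so doc | |||
| // 3's locations begin at the correct offset (1 + 4 = 5). | |||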
| // Posting is a single entry in a postings list | |||
| type Posting struct { | |||
| iterator *PostingsIterator | |||
| docNum uint64 | |||
| offset int | |||
| locoffset int | |||
| hasLoc bool | |||
| } | |||
| // Number returns the document number of this posting in this segment | |||
| func (p *Posting) Number() uint64 { | |||
| return p.docNum | |||
| } | |||
| // Frequency returns the frequency of occurrence of this term in this doc/field | |||
| func (p *Posting) Frequency() uint64 { | |||
| return p.iterator.postings.dictionary.segment.Freqs[p.iterator.postings.postingsID-1][p.offset] | |||
| } | |||
| // Norm returns the normalization factor for this posting | |||
| func (p *Posting) Norm() float64 { | |||
| return float64(p.iterator.postings.dictionary.segment.Norms[p.iterator.postings.postingsID-1][p.offset]) | |||
| } | |||
| // Locations returns the location information for each occurrence | |||
| func (p *Posting) Locations() []segment.Location { | |||
| if !p.hasLoc { | |||
| return nil | |||
| } | |||
| freq := int(p.Frequency()) | |||
| rv := make([]segment.Location, freq) | |||
| for i := 0; i < freq; i++ { | |||
| rv[i] = &Location{ | |||
| p: p, | |||
| offset: p.locoffset + i, | |||
| } | |||
| } | |||
| return rv | |||
| } | |||
| // Location represents the location of a single occurrence | |||
| type Location struct { | |||
| p *Posting | |||
| offset int | |||
| } | |||
| // Field returns the name of the field (useful in composite fields to know | |||
| // which original field the value came from) | |||
| func (l *Location) Field() string { | |||
| return l.p.iterator.postings.dictionary.segment.FieldsInv[l.p.iterator.postings.dictionary.segment.Locfields[l.p.iterator.postings.postingsID-1][l.offset]] | |||
| } | |||
| // Start returns the start byte offset of this occurrence | |||
| func (l *Location) Start() uint64 { | |||
| return l.p.iterator.postings.dictionary.segment.Locstarts[l.p.iterator.postings.postingsID-1][l.offset] | |||
| } | |||
| // End returns the end byte offset of this occurrence | |||
| func (l *Location) End() uint64 { | |||
| return l.p.iterator.postings.dictionary.segment.Locends[l.p.iterator.postings.postingsID-1][l.offset] | |||
| } | |||
| // Pos returns the 1-based phrase position of this occurrence | |||
| func (l *Location) Pos() uint64 { | |||
| return l.p.iterator.postings.dictionary.segment.Locpos[l.p.iterator.postings.postingsID-1][l.offset] | |||
| } | |||
| // ArrayPositions returns the array position vector associated with this occurrence | |||
| func (l *Location) ArrayPositions() []uint64 { | |||
| return l.p.iterator.postings.dictionary.segment.Locarraypos[l.p.iterator.postings.postingsID-1][l.offset] | |||
| } | |||
| @@ -1,289 +0,0 @@ | |||
| // Copyright (c) 2017 Couchbase, Inc. | |||
| // | |||
| // Licensed under the Apache License, Version 2.0 (the "License"); | |||
| // you may not use this file except in compliance with the License. | |||
| // You may obtain a copy of the License at | |||
| // | |||
| // http://www.apache.org/licenses/LICENSE-2.0 | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software | |||
| // distributed under the License is distributed on an "AS IS" BASIS, | |||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| // See the License for the specific language governing permissions and | |||
| // limitations under the License. | |||
| package mem | |||
| import ( | |||
| "fmt" | |||
| "github.com/RoaringBitmap/roaring" | |||
| "github.com/blevesearch/bleve/index/scorch/segment" | |||
| ) | |||
| // _id field is always guaranteed to have fieldID of 0 | |||
| const idFieldID uint16 = 0 | |||
| // KNOWN ISSUES | |||
| // - LIMITATION - we decide whether or not to store term vectors for a field | |||
| // at the segment level, based on the first definition of a | |||
| // field we see. in normal bleve usage this is fine, all | |||
| // instances of a field definition will be the same. however, | |||
| // advanced users may violate this and provide unique field | |||
| // definitions with each document. this segment does not | |||
| // support this usage. | |||
| // TODO | |||
| // - need better testing of multiple docs, iterating freqs, locations, | |||
| // and verifying the correct results are returned | |||
| // Segment is an in memory implementation of scorch.Segment | |||
| type Segment struct { | |||
| // FieldsMap adds 1 to field id to avoid zero value issues | |||
| // name -> field id + 1 | |||
| FieldsMap map[string]uint16 | |||
| // FieldsInv is the inverse of FieldsMap | |||
| // field id -> name | |||
| FieldsInv []string | |||
| // Term dictionaries for each field | |||
| // field id -> term -> postings list id + 1 | |||
| Dicts []map[string]uint64 | |||
| // Terms for each field, where terms are sorted ascending | |||
| // field id -> []term | |||
| DictKeys [][]string | |||
| // Postings list | |||
| // postings list id -> bitmap by docNum | |||
| Postings []*roaring.Bitmap | |||
| // Postings list has locations | |||
| PostingsLocs []*roaring.Bitmap | |||
| // Term frequencies | |||
| // postings list id -> Freqs (one for each hit in bitmap) | |||
| Freqs [][]uint64 | |||
| // Field norms | |||
| // postings list id -> Norms (one for each hit in bitmap) | |||
| Norms [][]float32 | |||
| // Field/start/end/pos/locarraypos | |||
| // postings list id -> start/end/pos/locarraypos (one for each freq) | |||
| Locfields [][]uint16 | |||
| Locstarts [][]uint64 | |||
| Locends [][]uint64 | |||
| Locpos [][]uint64 | |||
| Locarraypos [][][]uint64 | |||
| // Stored field values | |||
| // docNum -> field id -> slice of values (each value []byte) | |||
| Stored []map[uint16][][]byte | |||
| // Stored field types | |||
| // docNum -> field id -> slice of types (each type byte) | |||
| StoredTypes []map[uint16][]byte | |||
| // Stored field array positions | |||
| // docNum -> field id -> slice of array positions (each is []uint64) | |||
| StoredPos []map[uint16][][]uint64 | |||
| // For storing the docValue persisted fields | |||
| DocValueFields map[uint16]bool | |||
| // Footprint of the segment, updated when analyzed document mutations | |||
| // are added into the segment | |||
| sizeInBytes uint64 | |||
| } | |||
| // New builds a new empty Segment | |||
| func New() *Segment { | |||
| return &Segment{ | |||
| FieldsMap: map[string]uint16{}, | |||
| DocValueFields: map[uint16]bool{}, | |||
| } | |||
| } | |||
| func (s *Segment) updateSizeInBytes() { | |||
| var sizeInBytes uint64 | |||
| // FieldsMap, FieldsInv | |||
| for k := range s.FieldsMap { | |||
| sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 + | |||
| 2 /* size of uint16 */) | |||
| } | |||
| // overhead from the data structures | |||
| sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice) | |||
| // Dicts, DictKeys | |||
| for _, entry := range s.Dicts { | |||
| for k := range entry { | |||
| sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 + | |||
| 8 /* size of uint64 */) | |||
| } | |||
| // overhead from the data structures | |||
| sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice) | |||
| } | |||
| sizeInBytes += (segment.SizeOfSlice * 2) | |||
| // Postings, PostingsLocs | |||
| for i := 0; i < len(s.Postings); i++ { | |||
| sizeInBytes += (s.Postings[i].GetSizeInBytes() + segment.SizeOfPointer) + | |||
| (s.PostingsLocs[i].GetSizeInBytes() + segment.SizeOfPointer) | |||
| } | |||
| sizeInBytes += (segment.SizeOfSlice * 2) | |||
| // Freqs, Norms | |||
| for i := 0; i < len(s.Freqs); i++ { | |||
| sizeInBytes += uint64(len(s.Freqs[i])*8 /* size of uint64 */ + | |||
| len(s.Norms[i])*4 /* size of float32 */) + | |||
| (segment.SizeOfSlice * 2) | |||
| } | |||
| sizeInBytes += (segment.SizeOfSlice * 2) | |||
| // Location data | |||
| for i := 0; i < len(s.Locfields); i++ { | |||
| sizeInBytes += uint64(len(s.Locfields[i])*2 /* size of uint16 */ + | |||
| len(s.Locstarts[i])*8 /* size of uint64 */ + | |||
| len(s.Locends[i])*8 /* size of uint64 */ + | |||
| len(s.Locpos[i])*8 /* size of uint64 */) | |||
| for j := 0; j < len(s.Locarraypos[i]); j++ { | |||
| sizeInBytes += uint64(len(s.Locarraypos[i][j])*8 /* size of uint64 */) + | |||
| segment.SizeOfSlice | |||
| } | |||
| sizeInBytes += (segment.SizeOfSlice * 5) | |||
| } | |||
| sizeInBytes += (segment.SizeOfSlice * 5) | |||
| // Stored data | |||
| for i := 0; i < len(s.Stored); i++ { | |||
| for _, v := range s.Stored[i] { | |||
| sizeInBytes += uint64(2 /* size of uint16 */) | |||
| for _, arr := range v { | |||
| sizeInBytes += uint64(len(arr)) + segment.SizeOfSlice | |||
| } | |||
| sizeInBytes += segment.SizeOfSlice | |||
| } | |||
| for _, v := range s.StoredTypes[i] { | |||
| sizeInBytes += uint64(2 /* size of uint16 */ +len(v)) + segment.SizeOfSlice | |||
| } | |||
| for _, v := range s.StoredPos[i] { | |||
| sizeInBytes += uint64(2 /* size of uint16 */) | |||
| for _, arr := range v { | |||
| sizeInBytes += uint64(len(arr)*8 /* size of uint64 */) + | |||
| segment.SizeOfSlice | |||
| } | |||
| sizeInBytes += segment.SizeOfSlice | |||
| } | |||
| // overhead from map(s) within Stored, StoredTypes, StoredPos | |||
| sizeInBytes += (segment.SizeOfMap * 3) | |||
| } | |||
| // overhead from data structures: Stored, StoredTypes, StoredPos | |||
| sizeInBytes += (segment.SizeOfSlice * 3) | |||
| // DocValueFields | |||
| sizeInBytes += uint64(len(s.DocValueFields)*3 /* size of uint16 + bool */) + | |||
| segment.SizeOfMap | |||
| // SizeInBytes | |||
| sizeInBytes += uint64(8) | |||
| s.sizeInBytes = sizeInBytes | |||
| } | |||
| func (s *Segment) SizeInBytes() uint64 { | |||
| return s.sizeInBytes | |||
| } | |||
| func (s *Segment) AddRef() { | |||
| } | |||
| func (s *Segment) DecRef() error { | |||
| return nil | |||
| } | |||
| // Fields returns the field names used in this segment | |||
| func (s *Segment) Fields() []string { | |||
| return s.FieldsInv | |||
| } | |||
| // VisitDocument invokes the DocumentFieldValueVisitor for each stored field | |||
| // for the specified doc number | |||
| func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { | |||
| // ensure document number exists | |||
| if int(num) > len(s.Stored)-1 { | |||
| return nil | |||
| } | |||
| docFields := s.Stored[int(num)] | |||
| st := s.StoredTypes[int(num)] | |||
| sp := s.StoredPos[int(num)] | |||
| for field, values := range docFields { | |||
| for i, value := range values { | |||
| keepGoing := visitor(s.FieldsInv[field], st[field][i], value, sp[field][i]) | |||
| if !keepGoing { | |||
| return nil | |||
| } | |||
| } | |||
| } | |||
| return nil | |||
| } | |||
| func (s *Segment) getField(name string) (int, error) { | |||
| fieldID, ok := s.FieldsMap[name] | |||
| if !ok { | |||
| return 0, fmt.Errorf("no field named %s", name) | |||
| } | |||
| return int(fieldID - 1), nil | |||
| } | |||
| // Dictionary returns the term dictionary for the specified field | |||
| func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) { | |||
| fieldID, err := s.getField(field) | |||
| if err != nil { | |||
| // no such field, return empty dictionary | |||
| return &segment.EmptyDictionary{}, nil | |||
| } | |||
| return &Dictionary{ | |||
| segment: s, | |||
| field: field, | |||
| fieldID: uint16(fieldID), | |||
| }, nil | |||
| } | |||
| // Count returns the number of documents in this segment | |||
| // (this has no notion of deleted docs) | |||
| func (s *Segment) Count() uint64 { | |||
| return uint64(len(s.Stored)) | |||
| } | |||
| // DocNumbers returns a bitset corresponding to the doc numbers of all the | |||
| // provided _id strings | |||
| func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) { | |||
| rv := roaring.New() | |||
| // guard against empty segment | |||
| if len(s.FieldsMap) > 0 { | |||
| idDictionary := s.Dicts[idFieldID] | |||
| for _, id := range ids { | |||
| postingID := idDictionary[id] | |||
| if postingID > 0 { | |||
| rv.Or(s.Postings[postingID-1]) | |||
| } | |||
| } | |||
| } | |||
| return rv, nil | |||
| } | |||
| // Close releases all resources associated with this segment | |||
| func (s *Segment) Close() error { | |||
| return nil | |||
| } | |||
| @@ -0,0 +1,75 @@ | |||
| // Copyright (c) 2018 Couchbase, Inc. | |||
| // | |||
| // Licensed under the Apache License, Version 2.0 (the "License"); | |||
| // you may not use this file except in compliance with the License. | |||
| // You may obtain a copy of the License at | |||
| // | |||
| // http://www.apache.org/licenses/LICENSE-2.0 | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software | |||
| // distributed under the License is distributed on an "AS IS" BASIS, | |||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| // See the License for the specific language governing permissions and | |||
| // limitations under the License. | |||
| package segment | |||
| import ( | |||
| "regexp/syntax" | |||
| "github.com/couchbase/vellum/regexp" | |||
| ) | |||
| func ParseRegexp(pattern string) (a *regexp.Regexp, prefixBeg, prefixEnd []byte, err error) { | |||
| // TODO: potential optimization where syntax.Regexp supports a Simplify() API? | |||
| parsed, err := syntax.Parse(pattern, syntax.Perl) | |||
| if err != nil { | |||
| return nil, nil, nil, err | |||
| } | |||
| re, err := regexp.NewParsedWithLimit(pattern, parsed, regexp.DefaultLimit) | |||
| if err != nil { | |||
| return nil, nil, nil, err | |||
| } | |||
| prefix := LiteralPrefix(parsed) | |||
| if prefix != "" { | |||
| prefixBeg := []byte(prefix) | |||
| prefixEnd := IncrementBytes(prefixBeg) | |||
| return re, prefixBeg, prefixEnd, nil | |||
| } | |||
| return re, nil, nil, nil | |||
| } | |||
| // LiteralPrefix returns the literal prefix given the parse tree for a regexp | |||
| func LiteralPrefix(s *syntax.Regexp) string { | |||
| // traverse the left-most branch in the parse tree as long as the | |||
| // node represents a concatenation | |||
| for s != nil && s.Op == syntax.OpConcat { | |||
| if len(s.Sub) < 1 { | |||
| return "" | |||
| } | |||
| s = s.Sub[0] | |||
| } | |||
| if s.Op == syntax.OpLiteral { | |||
| return string(s.Rune) | |||
| } | |||
| return "" // no literal prefix | |||
| } | |||
| func IncrementBytes(in []byte) []byte { | |||
| rv := make([]byte, len(in)) | |||
| copy(rv, in) | |||
| for i := len(rv) - 1; i >= 0; i-- { | |||
| rv[i] = rv[i] + 1 | |||
| if rv[i] != 0 { | |||
| return rv // didn't overflow, so stop | |||
| } | |||
| } | |||
| return nil // overflowed | |||
| } | |||
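| // Together, LiteralPrefix and IncrementBytes bound the byte range a | |||
| // regexp's matches can fall in: every candidate term lies in | |||
| // [prefixBeg, prefixEnd). For the pattern `foo.*bar`: | |||
| // | |||
| //	LiteralPrefix(parsed)         // "foo" | |||
| //	IncrementBytes([]byte("foo")) // []byte("fop"), since 'o'+1 == 'p' | |||
| //	IncrementBytes([]byte{0xff})  // nil: overflow, no finite upper bound | |||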
| @@ -15,15 +15,14 @@ | |||
| package segment | |||
| import ( | |||
| "fmt" | |||
| "github.com/RoaringBitmap/roaring" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/couchbase/vellum" | |||
| ) | |||
| // Overhead from Go data structures when deployed on a 64-bit system. | |||
| const SizeOfMap uint64 = 8 | |||
| const SizeOfPointer uint64 = 8 | |||
| const SizeOfSlice uint64 = 24 | |||
| const SizeOfString uint64 = 16 | |||
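| // On 64-bit builds these match what unsafe.Sizeof reports for the | |||
| // corresponding headers: | |||
| // | |||
| //	unsafe.Sizeof(map[string]int(nil)) // 8, a map value is one pointer | |||
| //	unsafe.Sizeof([]byte(nil))         // 24, ptr + len + cap | |||
| //	unsafe.Sizeof("")                  // 16, ptr + len | |||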
| var ErrClosed = fmt.Errorf("index closed") | |||
| // DocumentFieldValueVisitor defines a callback to be visited for each | |||
| // stored field value. The return value determines if the visitor | |||
| @@ -34,6 +33,9 @@ type Segment interface { | |||
| Dictionary(field string) (TermDictionary, error) | |||
| VisitDocument(num uint64, visitor DocumentFieldValueVisitor) error | |||
| DocID(num uint64) ([]byte, error) | |||
| Count() uint64 | |||
| DocNumbers([]string) (*roaring.Bitmap, error) | |||
| @@ -42,18 +44,21 @@ type Segment interface { | |||
| Close() error | |||
| SizeInBytes() uint64 | |||
| Size() int | |||
| AddRef() | |||
| DecRef() error | |||
| } | |||
| type TermDictionary interface { | |||
| PostingsList(term string, except *roaring.Bitmap) (PostingsList, error) | |||
| PostingsList(term []byte, except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error) | |||
| Iterator() DictionaryIterator | |||
| PrefixIterator(prefix string) DictionaryIterator | |||
| RangeIterator(start, end string) DictionaryIterator | |||
| AutomatonIterator(a vellum.Automaton, | |||
| startKeyInclusive, endKeyExclusive []byte) DictionaryIterator | |||
| OnlyIterator(onlyTerms [][]byte, includeCount bool) DictionaryIterator | |||
| } | |||
| type DictionaryIterator interface { | |||
| @@ -61,7 +66,9 @@ type DictionaryIterator interface { | |||
| } | |||
| type PostingsList interface { | |||
| Iterator() PostingsIterator | |||
| Iterator(includeFreq, includeNorm, includeLocations bool, prealloc PostingsIterator) PostingsIterator | |||
| Size() int | |||
| Count() uint64 | |||
| @@ -77,6 +84,14 @@ type PostingsIterator interface { | |||
| // implementations may return a shared instance to reduce memory | |||
| // allocations. | |||
| Next() (Posting, error) | |||
| // Advance will return the posting with the specified doc number | |||
| // or if there is no such posting, the next posting. | |||
| // Callers MUST NOT attempt to pass a docNum that is less than or | |||
| // equal to the currently visited posting doc Num. | |||
| Advance(docNum uint64) (Posting, error) | |||
| Size() int | |||
| } | |||
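| // Advance enables leapfrog-style intersection of two postings streams | |||
| // without visiting every posting. A sketch against this interface, with | |||
| // error handling elided and the listA/listB/emit names hypothetical: | |||
| // | |||
| //	itA := listA.Iterator(false, false, false, nil) | |||
| //	itB := listB.Iterator(false, false, false, nil) | |||
| //	a, _ := itA.Next() | |||
| //	b, _ := itB.Next() | |||
| //	for a != nil && b != nil { | |||
| //		switch { | |||
| //		case a.Number() < b.Number(): | |||
| //			a, _ = itA.Advance(b.Number()) // docNum strictly greater, per the contract | |||
| //		case b.Number() < a.Number(): | |||
| //			b, _ = itB.Advance(a.Number()) | |||
| //		default: | |||
| //			emit(a.Number()) // both lists contain this doc | |||
| //			a, _ = itA.Next() | |||
| //			b, _ = itB.Next() | |||
| //		} | |||
| //	} | |||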
| type Posting interface { | |||
| @@ -86,6 +101,8 @@ type Posting interface { | |||
| Norm() float64 | |||
| Locations() []Location | |||
| Size() int | |||
| } | |||
| type Location interface { | |||
| @@ -94,6 +111,7 @@ type Location interface { | |||
| End() uint64 | |||
| Pos() uint64 | |||
| ArrayPositions() []uint64 | |||
| Size() int | |||
| } | |||
| // DocumentFieldTermVisitable is implemented by various scorch segment | |||
| @@ -101,10 +119,17 @@ type Location interface { | |||
| // postings or other indexed values. | |||
| type DocumentFieldTermVisitable interface { | |||
| VisitDocumentFieldTerms(localDocNum uint64, fields []string, | |||
| visitor index.DocumentFieldTermVisitor) error | |||
| visitor index.DocumentFieldTermVisitor, optional DocVisitState) (DocVisitState, error) | |||
| // VisitableDocValueFields implementation should return | |||
| // the list of fields which are document value persisted and | |||
| // therefore visitable by the above VisitDocumentFieldTerms method. | |||
| VisitableDocValueFields() ([]string, error) | |||
| } | |||
| type DocVisitState interface { | |||
| } | |||
| type StatsReporter interface { | |||
| ReportBytesWritten(bytesWritten uint64) | |||
| } | |||
| @@ -16,19 +16,13 @@ package zap | |||
| import ( | |||
| "bufio" | |||
| "bytes" | |||
| "encoding/binary" | |||
| "math" | |||
| "os" | |||
| "sort" | |||
| "github.com/Smerity/govarint" | |||
| "github.com/blevesearch/bleve/index/scorch/segment/mem" | |||
| "github.com/couchbase/vellum" | |||
| "github.com/golang/snappy" | |||
| ) | |||
| const version uint32 = 3 | |||
| const Version uint32 = 11 | |||
| const Type string = "zap" | |||
| const fieldNotUninverted = math.MaxUint64 | |||
| @@ -82,219 +76,39 @@ func PersistSegmentBase(sb *SegmentBase, path string) error { | |||
| return nil | |||
| } | |||
| // PersistSegment takes the in-memory segment and persists it to | |||
| // the specified path in the zap file format. | |||
| func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) error { | |||
| flag := os.O_RDWR | os.O_CREATE | |||
| f, err := os.OpenFile(path, flag, 0600) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| cleanup := func() { | |||
| _ = f.Close() | |||
| _ = os.Remove(path) | |||
| } | |||
| // buffer the output | |||
| br := bufio.NewWriter(f) | |||
| // wrap it for counting (tracking offsets) | |||
| cr := NewCountHashWriter(br) | |||
| numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, err := | |||
| persistBase(memSegment, cr, chunkFactor) | |||
| if err != nil { | |||
| cleanup() | |||
| return err | |||
| } | |||
| err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, | |||
| chunkFactor, cr.Sum32(), cr) | |||
| if err != nil { | |||
| cleanup() | |||
| return err | |||
| } | |||
| err = br.Flush() | |||
| if err != nil { | |||
| cleanup() | |||
| return err | |||
| } | |||
| err = f.Sync() | |||
| if err != nil { | |||
| cleanup() | |||
| return err | |||
| } | |||
| err = f.Close() | |||
| if err != nil { | |||
| cleanup() | |||
| return err | |||
| } | |||
| return nil | |||
| } | |||
| func persistBase(memSegment *mem.Segment, cr *CountHashWriter, chunkFactor uint32) ( | |||
| numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, | |||
| dictLocs []uint64, err error) { | |||
| docValueOffset = uint64(fieldNotUninverted) | |||
| if len(memSegment.Stored) > 0 { | |||
| storedIndexOffset, err = persistStored(memSegment, cr) | |||
| if err != nil { | |||
| return 0, 0, 0, 0, nil, err | |||
| } | |||
| freqOffsets, locOffsets, err := persistPostingDetails(memSegment, cr, chunkFactor) | |||
| if err != nil { | |||
| return 0, 0, 0, 0, nil, err | |||
| } | |||
| postingsListLocs, err := persistPostingsLocs(memSegment, cr) | |||
| if err != nil { | |||
| return 0, 0, 0, 0, nil, err | |||
| } | |||
| postingsLocs, err := persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets) | |||
| if err != nil { | |||
| return 0, 0, 0, 0, nil, err | |||
| } | |||
| dictLocs, err = persistDictionary(memSegment, cr, postingsLocs) | |||
| if err != nil { | |||
| return 0, 0, 0, 0, nil, err | |||
| } | |||
| docValueOffset, err = persistFieldDocValues(memSegment, cr, chunkFactor) | |||
| if err != nil { | |||
| return 0, 0, 0, 0, nil, err | |||
| } | |||
| } else { | |||
| dictLocs = make([]uint64, len(memSegment.FieldsInv)) | |||
| } | |||
| fieldsIndexOffset, err = persistFields(memSegment.FieldsInv, cr, dictLocs) | |||
| if err != nil { | |||
| return 0, 0, 0, 0, nil, err | |||
| } | |||
| return uint64(len(memSegment.Stored)), storedIndexOffset, fieldsIndexOffset, docValueOffset, | |||
| dictLocs, nil | |||
| } | |||
| func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) { | |||
| var curr int | |||
| var metaBuf bytes.Buffer | |||
| var data, compressed []byte | |||
| metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) | |||
| docNumOffsets := make(map[int]uint64, len(memSegment.Stored)) | |||
| for docNum, storedValues := range memSegment.Stored { | |||
| if docNum != 0 { | |||
| // reset buffer if necessary | |||
| curr = 0 | |||
| metaBuf.Reset() | |||
| data = data[:0] | |||
| compressed = compressed[:0] | |||
| } | |||
| st := memSegment.StoredTypes[docNum] | |||
| sp := memSegment.StoredPos[docNum] | |||
| // encode fields in order | |||
| for fieldID := range memSegment.FieldsInv { | |||
| if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok { | |||
| stf := st[uint16(fieldID)] | |||
| spf := sp[uint16(fieldID)] | |||
| var err2 error | |||
| curr, data, err2 = persistStoredFieldValues(fieldID, | |||
| storedFieldValues, stf, spf, curr, metaEncoder, data) | |||
| if err2 != nil { | |||
| return 0, err2 | |||
| } | |||
| } | |||
| } | |||
| metaEncoder.Close() | |||
| metaBytes := metaBuf.Bytes() | |||
| // compress the data | |||
| compressed = snappy.Encode(compressed, data) | |||
| // record where we're about to start writing | |||
| docNumOffsets[docNum] = uint64(w.Count()) | |||
| // write out the meta len and compressed data len | |||
| _, err := writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed))) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| // now write the meta | |||
| _, err = w.Write(metaBytes) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| // now write the compressed data | |||
| _, err = w.Write(compressed) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| } | |||
| // return value is the start of the stored index | |||
| rv := uint64(w.Count()) | |||
| // now write out the stored doc index | |||
| for docNum := range memSegment.Stored { | |||
| err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum]) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| } | |||
| return rv, nil | |||
| } | |||
| func persistStoredFieldValues(fieldID int, | |||
| storedFieldValues [][]byte, stf []byte, spf [][]uint64, | |||
| curr int, metaEncoder *govarint.Base128Encoder, data []byte) ( | |||
| curr int, metaEncode varintEncoder, data []byte) ( | |||
| int, []byte, error) { | |||
| for i := 0; i < len(storedFieldValues); i++ { | |||
| // encode field | |||
| _, err := metaEncoder.PutU64(uint64(fieldID)) | |||
| _, err := metaEncode(uint64(fieldID)) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| // encode type | |||
| _, err = metaEncoder.PutU64(uint64(stf[i])) | |||
| _, err = metaEncode(uint64(stf[i])) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| // encode start offset | |||
| _, err = metaEncoder.PutU64(uint64(curr)) | |||
| _, err = metaEncode(uint64(curr)) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| // end len | |||
| _, err = metaEncoder.PutU64(uint64(len(storedFieldValues[i]))) | |||
| _, err = metaEncode(uint64(len(storedFieldValues[i]))) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| // encode number of array pos | |||
| _, err = metaEncoder.PutU64(uint64(len(spf[i]))) | |||
| _, err = metaEncode(uint64(len(spf[i]))) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| // encode all array positions | |||
| for _, pos := range spf[i] { | |||
| _, err = metaEncoder.PutU64(pos) | |||
| _, err = metaEncode(pos) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| @@ -307,337 +121,6 @@ func persistStoredFieldValues(fieldID int, | |||
| return curr, data, nil | |||
| } | |||
| func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) ([]uint64, []uint64, error) { | |||
| var freqOffsets, locOffsets []uint64 | |||
| tfEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) | |||
| for postingID := range memSegment.Postings { | |||
| if postingID != 0 { | |||
| tfEncoder.Reset() | |||
| } | |||
| freqs := memSegment.Freqs[postingID] | |||
| norms := memSegment.Norms[postingID] | |||
| postingsListItr := memSegment.Postings[postingID].Iterator() | |||
| var offset int | |||
| for postingsListItr.HasNext() { | |||
| docNum := uint64(postingsListItr.Next()) | |||
| // put freq | |||
| err := tfEncoder.Add(docNum, freqs[offset]) | |||
| if err != nil { | |||
| return nil, nil, err | |||
| } | |||
| // put norm | |||
| norm := norms[offset] | |||
| normBits := math.Float32bits(norm) | |||
| err = tfEncoder.Add(docNum, uint64(normBits)) | |||
| if err != nil { | |||
| return nil, nil, err | |||
| } | |||
| offset++ | |||
| } | |||
| // record where this postings freq info starts | |||
| freqOffsets = append(freqOffsets, uint64(w.Count())) | |||
| tfEncoder.Close() | |||
| _, err := tfEncoder.Write(w) | |||
| if err != nil { | |||
| return nil, nil, err | |||
| } | |||
| } | |||
| // now do it again for the locations | |||
| locEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) | |||
| for postingID := range memSegment.Postings { | |||
| if postingID != 0 { | |||
| locEncoder.Reset() | |||
| } | |||
| freqs := memSegment.Freqs[postingID] | |||
| locfields := memSegment.Locfields[postingID] | |||
| locpos := memSegment.Locpos[postingID] | |||
| locstarts := memSegment.Locstarts[postingID] | |||
| locends := memSegment.Locends[postingID] | |||
| locarraypos := memSegment.Locarraypos[postingID] | |||
| postingsListItr := memSegment.Postings[postingID].Iterator() | |||
| var offset int | |||
| var locOffset int | |||
| for postingsListItr.HasNext() { | |||
| docNum := uint64(postingsListItr.Next()) | |||
| for i := 0; i < int(freqs[offset]); i++ { | |||
| if len(locfields) > 0 { | |||
| // put field | |||
| err := locEncoder.Add(docNum, uint64(locfields[locOffset])) | |||
| if err != nil { | |||
| return nil, nil, err | |||
| } | |||
| // put pos | |||
| err = locEncoder.Add(docNum, locpos[locOffset]) | |||
| if err != nil { | |||
| return nil, nil, err | |||
| } | |||
| // put start | |||
| err = locEncoder.Add(docNum, locstarts[locOffset]) | |||
| if err != nil { | |||
| return nil, nil, err | |||
| } | |||
| // put end | |||
| err = locEncoder.Add(docNum, locends[locOffset]) | |||
| if err != nil { | |||
| return nil, nil, err | |||
| } | |||
| // put the number of array positions to follow | |||
| num := len(locarraypos[locOffset]) | |||
| err = locEncoder.Add(docNum, uint64(num)) | |||
| if err != nil { | |||
| return nil, nil, err | |||
| } | |||
| // put each array position | |||
| for _, pos := range locarraypos[locOffset] { | |||
| err = locEncoder.Add(docNum, pos) | |||
| if err != nil { | |||
| return nil, nil, err | |||
| } | |||
| } | |||
| } | |||
| locOffset++ | |||
| } | |||
| offset++ | |||
| } | |||
| // record where this postings loc info starts | |||
| locOffsets = append(locOffsets, uint64(w.Count())) | |||
| locEncoder.Close() | |||
| _, err := locEncoder.Write(w) | |||
| if err != nil { | |||
| return nil, nil, err | |||
| } | |||
| } | |||
| return freqOffsets, locOffsets, nil | |||
| } | |||
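Worth noting in the loop above: each hit contributes two uvarints to the tfEncoder stream, the frequency followed by the norm, the norm being a float32 pushed through math.Float32bits so it survives integer encoding. Decoding has to undo both steps; a minimal sketch, assuming the chunk bytes are already in hand:

```go
// readFreqNorm reverses one tfEncoder.Add pair from above
// (imports: bytes, encoding/binary, math).
func readFreqNorm(r *bytes.Reader) (freq uint64, norm float32, err error) {
	if freq, err = binary.ReadUvarint(r); err != nil {
		return 0, 0, err
	}
	normBits, err := binary.ReadUvarint(r)
	if err != nil {
		return 0, 0, err
	}
	return freq, math.Float32frombits(uint32(normBits)), nil
}
```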
| func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) { | |||
| rv = make([]uint64, 0, len(memSegment.PostingsLocs)) | |||
| var reuseBuf bytes.Buffer | |||
| reuseBufVarint := make([]byte, binary.MaxVarintLen64) | |||
| for postingID := range memSegment.PostingsLocs { | |||
| // record where we start this posting loc | |||
| rv = append(rv, uint64(w.Count())) | |||
| // write out the length and bitmap | |||
| _, err = writeRoaringWithLen(memSegment.PostingsLocs[postingID], w, &reuseBuf, reuseBufVarint) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| return rv, nil | |||
| } | |||
| func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, | |||
| postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err error) { | |||
| rv = make([]uint64, 0, len(memSegment.Postings)) | |||
| var reuseBuf bytes.Buffer | |||
| reuseBufVarint := make([]byte, binary.MaxVarintLen64) | |||
| for postingID := range memSegment.Postings { | |||
| // record where we start this posting list | |||
| rv = append(rv, uint64(w.Count())) | |||
| // write out the term info, loc info, and loc posting list offset | |||
| _, err = writeUvarints(w, freqOffsets[postingID], | |||
| locOffsets[postingID], postingsListLocs[postingID]) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| // write out the length and bitmap | |||
| _, err = writeRoaringWithLen(memSegment.Postings[postingID], w, &reuseBuf, reuseBufVarint) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| return rv, nil | |||
| } | |||
| func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) { | |||
| rv := make([]uint64, 0, len(memSegment.DictKeys)) | |||
| varintBuf := make([]byte, binary.MaxVarintLen64) | |||
| var buffer bytes.Buffer | |||
| for fieldID, fieldTerms := range memSegment.DictKeys { | |||
| if fieldID != 0 { | |||
| buffer.Reset() | |||
| } | |||
| // start a new vellum for this field | |||
| builder, err := vellum.New(&buffer, nil) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| dict := memSegment.Dicts[fieldID] | |||
| // now walk the dictionary in order of fieldTerms (already sorted) | |||
| for _, fieldTerm := range fieldTerms { | |||
| postingID := dict[fieldTerm] - 1 | |||
| postingsAddr := postingsLocs[postingID] | |||
| err = builder.Insert([]byte(fieldTerm), postingsAddr) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| err = builder.Close() | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| // record where this dictionary starts | |||
| rv = append(rv, uint64(w.Count())) | |||
| vellumData := buffer.Bytes() | |||
| // write out the length of the vellum data | |||
| n := binary.PutUvarint(varintBuf, uint64(len(vellumData))) | |||
| _, err = w.Write(varintBuf[:n]) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| // write this vellum to disk | |||
| _, err = w.Write(vellumData) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| return rv, nil | |||
| } | |||
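Since each field dictionary is persisted as a uvarint length followed by the raw vellum bytes, loading it back is symmetric. A sketch under the assumption that the segment bytes and a dictionary's start offset (one of the rv entries above) are available; vellum.Load is the real vellum API, the surrounding names are illustrative:

```go
// loadFieldDict reads the length prefix written above and hands the
// FST bytes to vellum (imports: encoding/binary, fmt, couchbase/vellum).
func loadFieldDict(mem []byte, dictStart uint64) (*vellum.FST, error) {
	vellumLen, read := binary.Uvarint(mem[dictStart:])
	if read <= 0 {
		return nil, fmt.Errorf("corrupted dictionary length")
	}
	fstStart := dictStart + uint64(read)
	return vellum.Load(mem[fstStart : fstStart+vellumLen])
}
```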
| type docIDRange []uint64 | |||
| func (a docIDRange) Len() int { return len(a) } | |||
| func (a docIDRange) Swap(i, j int) { a[i], a[j] = a[j], a[i] } | |||
| func (a docIDRange) Less(i, j int) bool { return a[i] < a[j] } | |||
| func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, | |||
| chunkFactor uint32) (map[uint16]uint64, error) { | |||
| fieldChunkOffsets := make(map[uint16]uint64, len(memSegment.FieldsInv)) | |||
| fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) | |||
| for fieldID := range memSegment.DocValueFields { | |||
| field := memSegment.FieldsInv[fieldID] | |||
| docTermMap := make(map[uint64][]byte, 0) | |||
| dict, err := memSegment.Dictionary(field) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| dictItr := dict.Iterator() | |||
| next, err := dictItr.Next() | |||
| for err == nil && next != nil { | |||
| postings, err1 := dict.PostingsList(next.Term, nil) | |||
| if err1 != nil { | |||
| return nil, err | |||
| } | |||
| postingsItr := postings.Iterator() | |||
| nextPosting, err2 := postingsItr.Next() | |||
| for err2 == nil && nextPosting != nil { | |||
| docNum := nextPosting.Number() | |||
| docTermMap[docNum] = append(docTermMap[docNum], []byte(next.Term)...) | |||
| docTermMap[docNum] = append(docTermMap[docNum], termSeparator) | |||
| nextPosting, err2 = postingsItr.Next() | |||
| } | |||
| if err2 != nil { | |||
| return nil, err2 | |||
| } | |||
| next, err = dictItr.Next() | |||
| } | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| // sort with respect to docIDs | |||
| var docNumbers docIDRange | |||
| for k := range docTermMap { | |||
| docNumbers = append(docNumbers, k) | |||
| } | |||
| sort.Sort(docNumbers) | |||
| for _, docNum := range docNumbers { | |||
| err = fdvEncoder.Add(docNum, docTermMap[docNum]) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| fieldChunkOffsets[fieldID] = uint64(w.Count()) | |||
| err = fdvEncoder.Close() | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| // persist the doc value details for this field | |||
| _, err = fdvEncoder.Write(w) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| // resetting the encoder for the next field | |||
| fdvEncoder.Reset() | |||
| } | |||
| return fieldChunkOffsets, nil | |||
| } | |||
| func persistFieldDocValues(memSegment *mem.Segment, w *CountHashWriter, | |||
| chunkFactor uint32) (uint64, error) { | |||
| fieldDvOffsets, err := persistDocValues(memSegment, w, chunkFactor) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| fieldDocValuesOffset := uint64(w.Count()) | |||
| buf := make([]byte, binary.MaxVarintLen64) | |||
| offset := uint64(0) | |||
| ok := true | |||
| for fieldID := range memSegment.FieldsInv { | |||
| // if the field isn't configured for docValue, then mark | |||
| // the offset accordingly | |||
| if offset, ok = fieldDvOffsets[uint16(fieldID)]; !ok { | |||
| offset = fieldNotUninverted | |||
| } | |||
| n := binary.PutUvarint(buf, uint64(offset)) | |||
| _, err := w.Write(buf[:n]) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| } | |||
| return fieldDocValuesOffset, nil | |||
| } | |||
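The section written here is simply one uvarint per field in fieldID order, with the fieldNotUninverted sentinel standing in for fields that carry no doc values. Reading it back is a single forward pass; a sketch (the helper name and map shape are illustrative):

```go
// readFieldDvOffsets walks the per-field uvarints written by
// persistFieldDocValues; sketch only (imports: encoding/binary).
func readFieldDvOffsets(mem []byte, start uint64, numFields int) map[uint16]uint64 {
	rv := make(map[uint16]uint64, numFields)
	pos := start
	for fieldID := 0; fieldID < numFields; fieldID++ {
		offset, read := binary.Uvarint(mem[pos:])
		if read <= 0 {
			break // corrupt or truncated section; bail in this sketch
		}
		pos += uint64(read)
		if offset != fieldNotUninverted { // sentinel: no docValues for field
			rv[uint16(fieldID)] = offset
		}
	}
	return rv
}
```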
| func NewSegmentBase(memSegment *mem.Segment, chunkFactor uint32) (*SegmentBase, error) { | |||
| var br bytes.Buffer | |||
| cr := NewCountHashWriter(&br) | |||
| numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, err := | |||
| persistBase(memSegment, cr, chunkFactor) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| return InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor, | |||
| memSegment.FieldsMap, memSegment.FieldsInv, numDocs, | |||
| storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs) | |||
| } | |||
| func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32, | |||
| fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64, | |||
| storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64, | |||
| @@ -653,10 +136,11 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32, | |||
| fieldsIndexOffset: fieldsIndexOffset, | |||
| docValueOffset: docValueOffset, | |||
| dictLocs: dictLocs, | |||
| fieldDvIterMap: make(map[uint16]*docValueIterator), | |||
| fieldDvReaders: make(map[uint16]*docValueReader), | |||
| } | |||
| sb.updateSize() | |||
| err := sb.loadDvIterators() | |||
| err := sb.loadDvReaders() | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| @@ -18,41 +18,56 @@ import ( | |||
| "bytes" | |||
| "encoding/binary" | |||
| "io" | |||
| "reflect" | |||
| "github.com/golang/snappy" | |||
| ) | |||
| var reflectStaticSizeMetaData int | |||
| func init() { | |||
| var md MetaData | |||
| reflectStaticSizeMetaData = int(reflect.TypeOf(md).Size()) | |||
| } | |||
| var termSeparator byte = 0xff | |||
| var termSeparatorSplitSlice = []byte{termSeparator} | |||
| type chunkedContentCoder struct { | |||
| final []byte | |||
| chunkSize uint64 | |||
| currChunk uint64 | |||
| chunkLens []uint64 | |||
| final []byte | |||
| chunkSize uint64 | |||
| currChunk uint64 | |||
| chunkLens []uint64 | |||
| w io.Writer | |||
| progressiveWrite bool | |||
| chunkMetaBuf bytes.Buffer | |||
| chunkBuf bytes.Buffer | |||
| chunkMeta []MetaData | |||
| compressed []byte // temp buf for snappy compression | |||
| } | |||
| // MetaData represents the data information inside a | |||
| // chunk. | |||
| type MetaData struct { | |||
| DocNum uint64 // docNum of the data inside the chunk | |||
| DocDvLoc uint64 // starting offset for a given docid | |||
| DocDvLen uint64 // length of data inside the chunk for the given docid | |||
| DocNum uint64 // docNum of the data inside the chunk | |||
| DocDvOffset uint64 // offset of data inside the chunk for the given docid | |||
| } | |||
| // newChunkedContentCoder returns a new chunk content coder which | |||
| // packs data into chunks based on the provided chunkSize | |||
| func newChunkedContentCoder(chunkSize uint64, | |||
| maxDocNum uint64) *chunkedContentCoder { | |||
| func newChunkedContentCoder(chunkSize uint64, maxDocNum uint64, | |||
| w io.Writer, progressiveWrite bool) *chunkedContentCoder { | |||
| total := maxDocNum/chunkSize + 1 | |||
| rv := &chunkedContentCoder{ | |||
| chunkSize: chunkSize, | |||
| chunkLens: make([]uint64, total), | |||
| chunkMeta: make([]MetaData, 0, total), | |||
| chunkSize: chunkSize, | |||
| chunkLens: make([]uint64, total), | |||
| chunkMeta: make([]MetaData, 0, total), | |||
| w: w, | |||
| progressiveWrite: progressiveWrite, | |||
| } | |||
| return rv | |||
| @@ -88,7 +103,7 @@ func (c *chunkedContentCoder) flushContents() error { | |||
| // write out the metaData slice | |||
| for _, meta := range c.chunkMeta { | |||
| _, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvLoc, meta.DocDvLen) | |||
| _, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| @@ -98,10 +113,19 @@ func (c *chunkedContentCoder) flushContents() error { | |||
| metaData := c.chunkMetaBuf.Bytes() | |||
| c.final = append(c.final, c.chunkMetaBuf.Bytes()...) | |||
| // write the compressed data to the final data | |||
| compressedData := snappy.Encode(nil, c.chunkBuf.Bytes()) | |||
| c.final = append(c.final, compressedData...) | |||
| c.compressed = snappy.Encode(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes()) | |||
| c.final = append(c.final, c.compressed...) | |||
| c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData)) | |||
| if c.progressiveWrite { | |||
| _, err := c.w.Write(c.final) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| c.final = c.final[:0] | |||
| } | |||
| c.chunkLens[c.currChunk] = uint64(len(compressedData) + len(metaData)) | |||
| return nil | |||
| } | |||
| @@ -122,7 +146,7 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { | |||
| c.currChunk = chunk | |||
| } | |||
| // mark the starting offset for this doc | |||
| // get the starting offset for this doc | |||
| dvOffset := c.chunkBuf.Len() | |||
| dvSize, err := c.chunkBuf.Write(vals) | |||
| if err != nil { | |||
| @@ -130,38 +154,77 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { | |||
| } | |||
| c.chunkMeta = append(c.chunkMeta, MetaData{ | |||
| DocNum: docNum, | |||
| DocDvLoc: uint64(dvOffset), | |||
| DocDvLen: uint64(dvSize), | |||
| DocNum: docNum, | |||
| DocDvOffset: uint64(dvOffset + dvSize), | |||
| }) | |||
| return nil | |||
| } | |||
| // Write commits all the encoded chunked contents to the provided writer. | |||
| func (c *chunkedContentCoder) Write(w io.Writer) (int, error) { | |||
| // | |||
| // | ..... data ..... | chunk offsets (varints) | |||
| // | position of chunk offsets (uint64) | number of offsets (uint64) | | |||
| // | |||
| func (c *chunkedContentCoder) Write() (int, error) { | |||
| var tw int | |||
| buf := make([]byte, binary.MaxVarintLen64) | |||
| // write out the number of chunks | |||
| n := binary.PutUvarint(buf, uint64(len(c.chunkLens))) | |||
| nw, err := w.Write(buf[:n]) | |||
| tw += nw | |||
| if err != nil { | |||
| return tw, err | |||
| if c.final != nil { | |||
| // write out the data section first | |||
| nw, err := c.w.Write(c.final) | |||
| tw += nw | |||
| if err != nil { | |||
| return tw, err | |||
| } | |||
| } | |||
| chunkOffsetsStart := uint64(tw) | |||
| if cap(c.final) < binary.MaxVarintLen64 { | |||
| c.final = make([]byte, binary.MaxVarintLen64) | |||
| } else { | |||
| c.final = c.final[0:binary.MaxVarintLen64] | |||
| } | |||
| // write out the chunk lens | |||
| for _, chunkLen := range c.chunkLens { | |||
| n := binary.PutUvarint(buf, uint64(chunkLen)) | |||
| nw, err = w.Write(buf[:n]) | |||
| chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) | |||
| // write out the chunk offsets | |||
| for _, chunkOffset := range chunkOffsets { | |||
| n := binary.PutUvarint(c.final, chunkOffset) | |||
| nw, err := c.w.Write(c.final[:n]) | |||
| tw += nw | |||
| if err != nil { | |||
| return tw, err | |||
| } | |||
| } | |||
| // write out the data | |||
| nw, err = w.Write(c.final) | |||
| chunkOffsetsLen := uint64(tw) - chunkOffsetsStart | |||
| c.final = c.final[0:8] | |||
| // write out the length of chunk offsets | |||
| binary.BigEndian.PutUint64(c.final, chunkOffsetsLen) | |||
| nw, err := c.w.Write(c.final) | |||
| tw += nw | |||
| if err != nil { | |||
| return tw, err | |||
| } | |||
| // write out the number of chunks | |||
| binary.BigEndian.PutUint64(c.final, uint64(len(c.chunkLens))) | |||
| nw, err = c.w.Write(c.final) | |||
| tw += nw | |||
| if err != nil { | |||
| return tw, err | |||
| } | |||
| c.final = c.final[:0] | |||
| return tw, nil | |||
| } | |||
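So after this change the section is laid out as data, then varint chunk offsets, then two fixed big-endian uint64s: the byte length of the offsets block and the chunk count. That fixed-width tail is what lets a reader work backwards from the end of the section (the same arithmetic appears in loadFieldDocValueReader further down); a sketch:

```go
// locateChunkOffsets finds the chunk count and the start of the varint
// offsets block from the section's end (imports: encoding/binary).
func locateChunkOffsets(mem []byte, sectionEnd uint64) (numChunks, offsetsStart uint64) {
	numChunks = binary.BigEndian.Uint64(mem[sectionEnd-8 : sectionEnd])
	offsetsLen := binary.BigEndian.Uint64(mem[sectionEnd-16 : sectionEnd-8])
	offsetsStart = (sectionEnd - 16) - offsetsLen
	return numChunks, offsetsStart
}
```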
| // ReadDocValueBoundary returns the start and end offsets for the | |||
| // given entry in a MetaData header slice | |||
| func ReadDocValueBoundary(chunk int, metaHeaders []MetaData) (uint64, uint64) { | |||
| var start uint64 | |||
| if chunk > 0 { | |||
| start = metaHeaders[chunk-1].DocDvOffset | |||
| } | |||
| return start, metaHeaders[chunk].DocDvOffset | |||
| } | |||
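This is the payoff of collapsing the old (DocDvLoc, DocDvLen) pair into a single cumulative DocDvOffset: entry i ends at its own offset and starts at entry i-1's, with entry 0 starting at zero. A worked example with hypothetical header values:

```go
// Illustrative data only: three docs whose value blobs are 4, 0 and 5 bytes.
headers := []MetaData{
	{DocNum: 1, DocDvOffset: 4}, // bytes [0, 4)
	{DocNum: 3, DocDvOffset: 4}, // bytes [4, 4) -- empty value
	{DocNum: 9, DocDvOffset: 9}, // bytes [4, 9)
}
start, end := ReadDocValueBoundary(2, headers)
fmt.Println(start, end) // prints: 4 9
```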
| @@ -17,6 +17,8 @@ package zap | |||
| import ( | |||
| "hash/crc32" | |||
| "io" | |||
| "github.com/blevesearch/bleve/index/scorch/segment" | |||
| ) | |||
| // CountHashWriter is a wrapper around a Writer which counts the number of | |||
| @@ -25,6 +27,7 @@ type CountHashWriter struct { | |||
| w io.Writer | |||
| crc uint32 | |||
| n int | |||
| s segment.StatsReporter | |||
| } | |||
| // NewCountHashWriter returns a CountHashWriter which wraps the provided Writer | |||
| @@ -32,11 +35,18 @@ func NewCountHashWriter(w io.Writer) *CountHashWriter { | |||
| return &CountHashWriter{w: w} | |||
| } | |||
| func NewCountHashWriterWithStatsReporter(w io.Writer, s segment.StatsReporter) *CountHashWriter { | |||
| return &CountHashWriter{w: w, s: s} | |||
| } | |||
| // Write writes the provided bytes to the wrapped writer and counts the bytes | |||
| func (c *CountHashWriter) Write(b []byte) (int, error) { | |||
| n, err := c.w.Write(b) | |||
| c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n]) | |||
| c.n += n | |||
| if c.s != nil { | |||
| c.s.ReportBytesWritten(uint64(n)) | |||
| } | |||
| return n, err | |||
| } | |||
| @@ -15,38 +15,51 @@ | |||
| package zap | |||
| import ( | |||
| "bytes" | |||
| "fmt" | |||
| "github.com/RoaringBitmap/roaring" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/index/scorch/segment" | |||
| "github.com/couchbase/vellum" | |||
| "github.com/couchbase/vellum/regexp" | |||
| ) | |||
| // Dictionary is the zap representation of the term dictionary | |||
| type Dictionary struct { | |||
| sb *SegmentBase | |||
| field string | |||
| fieldID uint16 | |||
| fst *vellum.FST | |||
| sb *SegmentBase | |||
| field string | |||
| fieldID uint16 | |||
| fst *vellum.FST | |||
| fstReader *vellum.Reader | |||
| } | |||
| // PostingsList returns the postings list for the specified term | |||
| func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { | |||
| return d.postingsList([]byte(term), except, nil) | |||
| func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, | |||
| prealloc segment.PostingsList) (segment.PostingsList, error) { | |||
| var preallocPL *PostingsList | |||
| pl, ok := prealloc.(*PostingsList) | |||
| if ok && pl != nil { | |||
| preallocPL = pl | |||
| } | |||
| return d.postingsList(term, except, preallocPL) | |||
| } | |||
| func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { | |||
| if d.fst == nil { | |||
| if d.fstReader == nil { | |||
| if rv == nil || rv == emptyPostingsList { | |||
| return emptyPostingsList, nil | |||
| } | |||
| return d.postingsListInit(rv, except), nil | |||
| } | |||
| postingsOffset, exists, err := d.fst.Get(term) | |||
| postingsOffset, exists, err := d.fstReader.Get(term) | |||
| if err != nil { | |||
| return nil, fmt.Errorf("vellum err: %v", err) | |||
| } | |||
| if !exists { | |||
| if rv == nil || rv == emptyPostingsList { | |||
| return emptyPostingsList, nil | |||
| } | |||
| return d.postingsListInit(rv, except), nil | |||
| } | |||
| @@ -65,10 +78,17 @@ func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roari | |||
| } | |||
| func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList { | |||
| if rv == nil { | |||
| if rv == nil || rv == emptyPostingsList { | |||
| rv = &PostingsList{} | |||
| } else { | |||
| postings := rv.postings | |||
| if postings != nil { | |||
| postings.Clear() | |||
| } | |||
| *rv = PostingsList{} // clear the struct | |||
| rv.postings = postings | |||
| } | |||
| rv.sb = d.sb | |||
| rv.except = except | |||
| @@ -85,6 +105,8 @@ func (d *Dictionary) Iterator() segment.DictionaryIterator { | |||
| itr, err := d.fst.Iterator(nil, nil) | |||
| if err == nil { | |||
| rv.itr = itr | |||
| } else if err != vellum.ErrIteratorDone { | |||
| rv.err = err | |||
| } | |||
| } | |||
| @@ -98,13 +120,15 @@ func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { | |||
| d: d, | |||
| } | |||
| kBeg := []byte(prefix) | |||
| kEnd := segment.IncrementBytes(kBeg) | |||
| if d.fst != nil { | |||
| r, err := regexp.New(prefix + ".*") | |||
| itr, err := d.fst.Iterator(kBeg, kEnd) | |||
| if err == nil { | |||
| itr, err := d.fst.Search(r, nil, nil) | |||
| if err == nil { | |||
| rv.itr = itr | |||
| } | |||
| rv.itr = itr | |||
| } else if err != vellum.ErrIteratorDone { | |||
| rv.err = err | |||
| } | |||
| } | |||
| @@ -130,36 +154,103 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator | |||
| itr, err := d.fst.Iterator([]byte(start), endBytes) | |||
| if err == nil { | |||
| rv.itr = itr | |||
| } else if err != vellum.ErrIteratorDone { | |||
| rv.err = err | |||
| } | |||
| } | |||
| return rv | |||
| } | |||
| // AutomatonIterator returns an iterator which only visits terms | |||
| // matching the vellum automaton and the start/end key range | |||
| func (d *Dictionary) AutomatonIterator(a vellum.Automaton, | |||
| startKeyInclusive, endKeyExclusive []byte) segment.DictionaryIterator { | |||
| rv := &DictionaryIterator{ | |||
| d: d, | |||
| } | |||
| if d.fst != nil { | |||
| itr, err := d.fst.Search(a, startKeyInclusive, endKeyExclusive) | |||
| if err == nil { | |||
| rv.itr = itr | |||
| } else if err != vellum.ErrIteratorDone { | |||
| rv.err = err | |||
| } | |||
| } | |||
| return rv | |||
| } | |||
| func (d *Dictionary) OnlyIterator(onlyTerms [][]byte, | |||
| includeCount bool) segment.DictionaryIterator { | |||
| rv := &DictionaryIterator{ | |||
| d: d, | |||
| omitCount: !includeCount, | |||
| } | |||
| var buf bytes.Buffer | |||
| builder, err := vellum.New(&buf, nil) | |||
| if err != nil { | |||
| rv.err = err | |||
| return rv | |||
| } | |||
| for _, term := range onlyTerms { | |||
| err = builder.Insert(term, 0) | |||
| if err != nil { | |||
| rv.err = err | |||
| return rv | |||
| } | |||
| } | |||
| err = builder.Close() | |||
| if err != nil { | |||
| rv.err = err | |||
| return rv | |||
| } | |||
| onlyFST, err := vellum.Load(buf.Bytes()) | |||
| if err != nil { | |||
| rv.err = err | |||
| return rv | |||
| } | |||
| itr, err := d.fst.Search(onlyFST, nil, nil) | |||
| if err == nil { | |||
| rv.itr = itr | |||
| } else if err != vellum.ErrIteratorDone { | |||
| rv.err = err | |||
| } | |||
| return rv | |||
| } | |||
| // DictionaryIterator is an iterator for the term dictionary | |||
| type DictionaryIterator struct { | |||
| d *Dictionary | |||
| itr vellum.Iterator | |||
| err error | |||
| tmp PostingsList | |||
| d *Dictionary | |||
| itr vellum.Iterator | |||
| err error | |||
| tmp PostingsList | |||
| entry index.DictEntry | |||
| omitCount bool | |||
| } | |||
| // Next returns the next entry in the dictionary | |||
| func (i *DictionaryIterator) Next() (*index.DictEntry, error) { | |||
| if i.itr == nil || i.err == vellum.ErrIteratorDone { | |||
| return nil, nil | |||
| } else if i.err != nil { | |||
| if i.err != nil && i.err != vellum.ErrIteratorDone { | |||
| return nil, i.err | |||
| } else if i.itr == nil || i.err == vellum.ErrIteratorDone { | |||
| return nil, nil | |||
| } | |||
| term, postingsOffset := i.itr.Current() | |||
| i.err = i.tmp.read(postingsOffset, i.d) | |||
| if i.err != nil { | |||
| return nil, i.err | |||
| } | |||
| rv := &index.DictEntry{ | |||
| Term: string(term), | |||
| Count: i.tmp.Count(), | |||
| i.entry.Term = string(term) | |||
| if !i.omitCount { | |||
| i.err = i.tmp.read(postingsOffset, i.d) | |||
| if i.err != nil { | |||
| return nil, i.err | |||
| } | |||
| i.entry.Count = i.tmp.Count() | |||
| } | |||
| i.err = i.itr.Next() | |||
| return rv, nil | |||
| return &i.entry, nil | |||
| } | |||
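A behavioral consequence of the rewrite above: Next now returns a pointer into the iterator's reused entry field, overwritten on every call, instead of allocating a fresh DictEntry. Callers that keep entries across iterations should copy the struct value; a sketch (itr assumed to come from Dictionary.Iterator()):

```go
var entries []index.DictEntry
ent, err := itr.Next()
for err == nil && ent != nil {
	entries = append(entries, *ent) // copy the value; the pointer is reused
	ent, err = itr.Next()
}
```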
| @@ -19,93 +19,129 @@ import ( | |||
| "encoding/binary" | |||
| "fmt" | |||
| "math" | |||
| "reflect" | |||
| "sort" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/index/scorch/segment" | |||
| "github.com/blevesearch/bleve/size" | |||
| "github.com/golang/snappy" | |||
| ) | |||
| type docValueIterator struct { | |||
| var reflectStaticSizedocValueReader int | |||
| func init() { | |||
| var dvi docValueReader | |||
| reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size()) | |||
| } | |||
| type docNumTermsVisitor func(docNum uint64, terms []byte) error | |||
| type docVisitState struct { | |||
| dvrs map[uint16]*docValueReader | |||
| segment *Segment | |||
| } | |||
| type docValueReader struct { | |||
| field string | |||
| curChunkNum uint64 | |||
| numChunks uint64 | |||
| chunkLens []uint64 | |||
| chunkOffsets []uint64 | |||
| dvDataLoc uint64 | |||
| curChunkHeader []MetaData | |||
| curChunkData []byte // compressed data cache | |||
| uncompressed []byte // temp buf for snappy decompression | |||
| } | |||
| func (di *docValueIterator) sizeInBytes() uint64 { | |||
| // curChunkNum, numChunks, dvDataLoc --> uint64 | |||
| sizeInBytes := 24 | |||
| // field | |||
| sizeInBytes += (len(di.field) + int(segment.SizeOfString)) | |||
| func (di *docValueReader) size() int { | |||
| return reflectStaticSizedocValueReader + size.SizeOfPtr + | |||
| len(di.field) + | |||
| len(di.chunkOffsets)*size.SizeOfUint64 + | |||
| len(di.curChunkHeader)*reflectStaticSizeMetaData + | |||
| len(di.curChunkData) | |||
| } | |||
| // chunkLens, curChunkHeader | |||
| sizeInBytes += len(di.chunkLens)*8 + | |||
| len(di.curChunkHeader)*24 + | |||
| int(segment.SizeOfSlice*2) /* overhead from slices */ | |||
| func (di *docValueReader) cloneInto(rv *docValueReader) *docValueReader { | |||
| if rv == nil { | |||
| rv = &docValueReader{} | |||
| } | |||
| // curChunkData is mmap'ed, not included | |||
| rv.field = di.field | |||
| rv.curChunkNum = math.MaxUint64 | |||
| rv.chunkOffsets = di.chunkOffsets // immutable, so it's sharable | |||
| rv.dvDataLoc = di.dvDataLoc | |||
| rv.curChunkHeader = rv.curChunkHeader[:0] | |||
| rv.curChunkData = nil | |||
| rv.uncompressed = rv.uncompressed[:0] | |||
| return uint64(sizeInBytes) | |||
| return rv | |||
| } | |||
| func (di *docValueIterator) fieldName() string { | |||
| func (di *docValueReader) fieldName() string { | |||
| return di.field | |||
| } | |||
| func (di *docValueIterator) curChunkNumber() uint64 { | |||
| func (di *docValueReader) curChunkNumber() uint64 { | |||
| return di.curChunkNum | |||
| } | |||
| func (s *SegmentBase) loadFieldDocValueIterator(field string, | |||
| fieldDvLoc uint64) (*docValueIterator, error) { | |||
| func (s *SegmentBase) loadFieldDocValueReader(field string, | |||
| fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) { | |||
| // get the docValue offset for the given fields | |||
| if fieldDvLoc == fieldNotUninverted { | |||
| return nil, fmt.Errorf("loadFieldDocValueIterator: "+ | |||
| if fieldDvLocStart == fieldNotUninverted { | |||
| return nil, fmt.Errorf("loadFieldDocValueReader: "+ | |||
| "no docValues found for field: %s", field) | |||
| } | |||
| // read the number of chunks, chunk lengths | |||
| var offset, clen uint64 | |||
| numChunks, read := binary.Uvarint(s.mem[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64]) | |||
| if read <= 0 { | |||
| return nil, fmt.Errorf("failed to read the field "+ | |||
| "doc values for field %s", field) | |||
| // read the number of chunks, and chunk offsets position | |||
| var numChunks, chunkOffsetsPosition uint64 | |||
| if fieldDvLocEnd-fieldDvLocStart > 16 { | |||
| numChunks = binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-8 : fieldDvLocEnd]) | |||
| // read the length of chunk offsets | |||
| chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8]) | |||
| // acquire position of chunk offsets | |||
| chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen | |||
| } | |||
| offset += uint64(read) | |||
| fdvIter := &docValueIterator{ | |||
| curChunkNum: math.MaxUint64, | |||
| field: field, | |||
| chunkLens: make([]uint64, int(numChunks)), | |||
| fdvIter := &docValueReader{ | |||
| curChunkNum: math.MaxUint64, | |||
| field: field, | |||
| chunkOffsets: make([]uint64, int(numChunks)), | |||
| } | |||
| // read the chunk offsets | |||
| var offset uint64 | |||
| for i := 0; i < int(numChunks); i++ { | |||
| clen, read = binary.Uvarint(s.mem[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64]) | |||
| loc, read := binary.Uvarint(s.mem[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+binary.MaxVarintLen64]) | |||
| if read <= 0 { | |||
| return nil, fmt.Errorf("corrupted chunk length during segment load") | |||
| return nil, fmt.Errorf("corrupted chunk offset during segment load") | |||
| } | |||
| fdvIter.chunkLens[i] = clen | |||
| fdvIter.chunkOffsets[i] = loc | |||
| offset += uint64(read) | |||
| } | |||
| fdvIter.dvDataLoc = fieldDvLoc + offset | |||
| // set the data offset | |||
| fdvIter.dvDataLoc = fieldDvLocStart | |||
| return fdvIter, nil | |||
| } | |||
| func (di *docValueIterator) loadDvChunk(chunkNumber, | |||
| localDocNum uint64, s *SegmentBase) error { | |||
| func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error { | |||
| // advance to the chunk where the docValues | |||
| // reside for the given docNum | |||
| destChunkDataLoc := di.dvDataLoc | |||
| for i := 0; i < int(chunkNumber); i++ { | |||
| destChunkDataLoc += di.chunkLens[i] | |||
| destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc | |||
| start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets) | |||
| if start >= end { | |||
| di.curChunkHeader = di.curChunkHeader[:0] | |||
| di.curChunkData = nil | |||
| di.curChunkNum = chunkNumber | |||
| di.uncompressed = di.uncompressed[:0] | |||
| return nil | |||
| } | |||
| curChunkSize := di.chunkLens[chunkNumber] | |||
| destChunkDataLoc += start | |||
| curChunkEnd += end | |||
| // read the number of docs residing in the chunk | |||
| numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64]) | |||
| if read <= 0 { | |||
| @@ -114,38 +150,81 @@ func (di *docValueIterator) loadDvChunk(chunkNumber, | |||
| chunkMetaLoc := destChunkDataLoc + uint64(read) | |||
| offset := uint64(0) | |||
| di.curChunkHeader = make([]MetaData, int(numDocs)) | |||
| if cap(di.curChunkHeader) < int(numDocs) { | |||
| di.curChunkHeader = make([]MetaData, int(numDocs)) | |||
| } else { | |||
| di.curChunkHeader = di.curChunkHeader[:int(numDocs)] | |||
| } | |||
| for i := 0; i < int(numDocs); i++ { | |||
| di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) | |||
| offset += uint64(read) | |||
| di.curChunkHeader[i].DocDvLoc, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) | |||
| offset += uint64(read) | |||
| di.curChunkHeader[i].DocDvLen, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) | |||
| di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) | |||
| offset += uint64(read) | |||
| } | |||
| compressedDataLoc := chunkMetaLoc + offset | |||
| dataLength := destChunkDataLoc + curChunkSize - compressedDataLoc | |||
| dataLength := curChunkEnd - compressedDataLoc | |||
| di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength] | |||
| di.curChunkNum = chunkNumber | |||
| di.uncompressed = di.uncompressed[:0] | |||
| return nil | |||
| } | |||
| func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error { | |||
| for i := 0; i < len(di.chunkOffsets); i++ { | |||
| err := di.loadDvChunk(uint64(i), s) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| if di.curChunkData == nil || len(di.curChunkHeader) == 0 { | |||
| continue | |||
| } | |||
| // uncompress the already loaded data | |||
| uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| di.uncompressed = uncompressed | |||
| start := uint64(0) | |||
| for _, entry := range di.curChunkHeader { | |||
| err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset]) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| start = entry.DocDvOffset | |||
| } | |||
| } | |||
| return nil | |||
| } | |||
| func (di *docValueIterator) visitDocValues(docNum uint64, | |||
| func (di *docValueReader) visitDocValues(docNum uint64, | |||
| visitor index.DocumentFieldTermVisitor) error { | |||
| // binary search the term locations for the docNum | |||
| start, length := di.getDocValueLocs(docNum) | |||
| if start == math.MaxUint64 || length == math.MaxUint64 { | |||
| start, end := di.getDocValueLocs(docNum) | |||
| if start == math.MaxUint64 || end == math.MaxUint64 || start == end { | |||
| return nil | |||
| } | |||
| // uncompress the already loaded data | |||
| uncompressed, err := snappy.Decode(nil, di.curChunkData) | |||
| if err != nil { | |||
| return err | |||
| var uncompressed []byte | |||
| var err error | |||
| // use the uncompressed copy if available | |||
| if len(di.uncompressed) > 0 { | |||
| uncompressed = di.uncompressed | |||
| } else { | |||
| // uncompress the already loaded data | |||
| uncompressed, err = snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| di.uncompressed = uncompressed | |||
| } | |||
| // pick the terms for the given docNum | |||
| uncompressed = uncompressed[start : start+length] | |||
| uncompressed = uncompressed[start:end] | |||
| for { | |||
| i := bytes.Index(uncompressed, termSeparatorSplitSlice) | |||
| if i < 0 { | |||
| @@ -159,55 +238,72 @@ func (di *docValueIterator) visitDocValues(docNum uint64, | |||
| return nil | |||
| } | |||
| func (di *docValueIterator) getDocValueLocs(docNum uint64) (uint64, uint64) { | |||
| func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) { | |||
| i := sort.Search(len(di.curChunkHeader), func(i int) bool { | |||
| return di.curChunkHeader[i].DocNum >= docNum | |||
| }) | |||
| if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum { | |||
| return di.curChunkHeader[i].DocDvLoc, di.curChunkHeader[i].DocDvLen | |||
| return ReadDocValueBoundary(i, di.curChunkHeader) | |||
| } | |||
| return math.MaxUint64, math.MaxUint64 | |||
| } | |||
| // VisitDocumentFieldTerms is an implementation of the | |||
| // DocumentFieldTermVisitable interface | |||
| func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string, | |||
| visitor index.DocumentFieldTermVisitor) error { | |||
| fieldIDPlus1 := uint16(0) | |||
| ok := true | |||
| func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string, | |||
| visitor index.DocumentFieldTermVisitor, dvsIn segment.DocVisitState) ( | |||
| segment.DocVisitState, error) { | |||
| dvs, ok := dvsIn.(*docVisitState) | |||
| if !ok || dvs == nil { | |||
| dvs = &docVisitState{} | |||
| } else { | |||
| if dvs.segment != s { | |||
| dvs.segment = s | |||
| dvs.dvrs = nil | |||
| } | |||
| } | |||
| var fieldIDPlus1 uint16 | |||
| if dvs.dvrs == nil { | |||
| dvs.dvrs = make(map[uint16]*docValueReader, len(fields)) | |||
| for _, field := range fields { | |||
| if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { | |||
| continue | |||
| } | |||
| fieldID := fieldIDPlus1 - 1 | |||
| if dvIter, exists := s.fieldDvReaders[fieldID]; exists && | |||
| dvIter != nil { | |||
| dvs.dvrs[fieldID] = dvIter.cloneInto(dvs.dvrs[fieldID]) | |||
| } | |||
| } | |||
| } | |||
| // find the chunkNumber where the docValues are stored | |||
| docInChunk := localDocNum / uint64(s.chunkFactor) | |||
| var dvr *docValueReader | |||
| for _, field := range fields { | |||
| if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { | |||
| continue | |||
| } | |||
| // find the chunkNumber where the docValues are stored | |||
| docInChunk := localDocNum / uint64(s.chunkFactor) | |||
| if dvIter, exists := s.fieldDvIterMap[fieldIDPlus1-1]; exists && | |||
| dvIter != nil { | |||
| fieldID := fieldIDPlus1 - 1 | |||
| if dvr, ok = dvs.dvrs[fieldID]; ok && dvr != nil { | |||
| // check if the chunk is already loaded | |||
| if docInChunk != dvIter.curChunkNumber() { | |||
| err := dvIter.loadDvChunk(docInChunk, localDocNum, s) | |||
| if docInChunk != dvr.curChunkNumber() { | |||
| err := dvr.loadDvChunk(docInChunk, &s.SegmentBase) | |||
| if err != nil { | |||
| continue | |||
| return dvs, err | |||
| } | |||
| } | |||
| _ = dvIter.visitDocValues(localDocNum, visitor) | |||
| _ = dvr.visitDocValues(localDocNum, visitor) | |||
| } | |||
| } | |||
| return nil | |||
| return dvs, nil | |||
| } | |||
| // VisitableDocValueFields returns the list of fields with | |||
| // persisted doc value terms ready to be visitable using the | |||
| // VisitDocumentFieldTerms method. | |||
| func (s *Segment) VisitableDocValueFields() ([]string, error) { | |||
| var rv []string | |||
| for fieldID, field := range s.fieldsInv { | |||
| if dvIter, ok := s.fieldDvIterMap[uint16(fieldID)]; ok && | |||
| dvIter != nil { | |||
| rv = append(rv, field) | |||
| } | |||
| } | |||
| return rv, nil | |||
| return s.fieldDvNames, nil | |||
| } | |||
| @@ -46,26 +46,27 @@ func newEnumerator(itrs []vellum.Iterator) (*enumerator, error) { | |||
| for i, itr := range rv.itrs { | |||
| rv.currKs[i], rv.currVs[i] = itr.Current() | |||
| } | |||
| rv.updateMatches() | |||
| if rv.lowK == nil { | |||
| rv.updateMatches(false) | |||
| if rv.lowK == nil && len(rv.lowIdxs) == 0 { | |||
| return rv, vellum.ErrIteratorDone | |||
| } | |||
| return rv, nil | |||
| } | |||
| // updateMatches maintains the low key matches based on the currKs | |||
| func (m *enumerator) updateMatches() { | |||
| func (m *enumerator) updateMatches(skipEmptyKey bool) { | |||
| m.lowK = nil | |||
| m.lowIdxs = m.lowIdxs[:0] | |||
| m.lowCurr = 0 | |||
| for i, key := range m.currKs { | |||
| if key == nil { | |||
| if (key == nil && m.currVs[i] == 0) || // in case of empty iterator | |||
| (len(key) == 0 && skipEmptyKey) { // skip empty keys | |||
| continue | |||
| } | |||
| cmp := bytes.Compare(key, m.lowK) | |||
| if cmp < 0 || m.lowK == nil { | |||
| if cmp < 0 || len(m.lowIdxs) == 0 { | |||
| // reached a new low | |||
| m.lowK = key | |||
| m.lowIdxs = m.lowIdxs[:0] | |||
| @@ -102,9 +103,10 @@ func (m *enumerator) Next() error { | |||
| } | |||
| m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current() | |||
| } | |||
| m.updateMatches() | |||
| // can skip any empty keys encountered at this point | |||
| m.updateMatches(true) | |||
| } | |||
| if m.lowK == nil { | |||
| if m.lowK == nil && len(m.lowIdxs) == 0 { | |||
| return vellum.ErrIteratorDone | |||
| } | |||
| return nil | |||
| @@ -18,16 +18,12 @@ import ( | |||
| "bytes" | |||
| "encoding/binary" | |||
| "io" | |||
| "github.com/Smerity/govarint" | |||
| ) | |||
| type chunkedIntCoder struct { | |||
| final []byte | |||
| maxDocNum uint64 | |||
| chunkSize uint64 | |||
| chunkBuf bytes.Buffer | |||
| encoder *govarint.Base128Encoder | |||
| chunkLens []uint64 | |||
| currChunk uint64 | |||
| @@ -41,11 +37,9 @@ func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder { | |||
| total := maxDocNum/chunkSize + 1 | |||
| rv := &chunkedIntCoder{ | |||
| chunkSize: chunkSize, | |||
| maxDocNum: maxDocNum, | |||
| chunkLens: make([]uint64, total), | |||
| final: make([]byte, 0, 64), | |||
| } | |||
| rv.encoder = govarint.NewU64Base128Encoder(&rv.chunkBuf) | |||
| return rv | |||
| } | |||
| @@ -67,16 +61,18 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error { | |||
| chunk := docNum / c.chunkSize | |||
| if chunk != c.currChunk { | |||
| // starting a new chunk | |||
| if c.encoder != nil { | |||
| // close out last | |||
| c.Close() | |||
| c.chunkBuf.Reset() | |||
| } | |||
| c.Close() | |||
| c.chunkBuf.Reset() | |||
| c.currChunk = chunk | |||
| } | |||
| if len(c.buf) < binary.MaxVarintLen64 { | |||
| c.buf = make([]byte, binary.MaxVarintLen64) | |||
| } | |||
| for _, val := range vals { | |||
| _, err := c.encoder.PutU64(val) | |||
| wb := binary.PutUvarint(c.buf, val) | |||
| _, err := c.chunkBuf.Write(c.buf[:wb]) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| @@ -85,13 +81,26 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error { | |||
| return nil | |||
| } | |||
| func (c *chunkedIntCoder) AddBytes(docNum uint64, buf []byte) error { | |||
| chunk := docNum / c.chunkSize | |||
| if chunk != c.currChunk { | |||
| // starting a new chunk | |||
| c.Close() | |||
| c.chunkBuf.Reset() | |||
| c.currChunk = chunk | |||
| } | |||
| _, err := c.chunkBuf.Write(buf) | |||
| return err | |||
| } | |||
| // Close indicates you are done calling Add(); this allows the final chunk | |||
| // to be encoded. | |||
| func (c *chunkedIntCoder) Close() { | |||
| c.encoder.Close() | |||
| encodingBytes := c.chunkBuf.Bytes() | |||
| c.chunkLens[c.currChunk] = uint64(len(encodingBytes)) | |||
| c.final = append(c.final, encodingBytes...) | |||
| c.currChunk = uint64(cap(c.chunkLens)) // sentinel to detect double close | |||
| } | |||
| // Write commits all the encoded chunked integers to the provided writer. | |||
| @@ -102,10 +111,13 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) { | |||
| } | |||
| buf := c.buf | |||
| // write out the number of chunks & each chunkLen | |||
| n := binary.PutUvarint(buf, uint64(len(c.chunkLens))) | |||
| for _, chunkLen := range c.chunkLens { | |||
| n += binary.PutUvarint(buf[n:], uint64(chunkLen)) | |||
| // convert the chunk lengths into chunk offsets | |||
| chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) | |||
| // write out the number of chunks & each chunk offsets | |||
| n := binary.PutUvarint(buf, uint64(len(chunkOffsets))) | |||
| for _, chunkOffset := range chunkOffsets { | |||
| n += binary.PutUvarint(buf[n:], chunkOffset) | |||
| } | |||
| tw, err := w.Write(buf[:n]) | |||
| @@ -121,3 +133,40 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) { | |||
| } | |||
| return tw, nil | |||
| } | |||
| func (c *chunkedIntCoder) FinalSize() int { | |||
| return len(c.final) | |||
| } | |||
| // modifyLengthsToEndOffsets converts the chunk length array | |||
| // into a chunk end-offset array, in place. readChunkBoundary | |||
| // then recovers the start and end of every chunk from these | |||
| // offsets: the start offset of chunk i is stored at position | |||
| // i-1 (zero for chunk 0), and its end offset at position i. | |||
| // eg: | |||
| // Lens -> 5 5 5 5 => 5 10 15 20 | |||
| // Lens -> 0 5 0 5 => 0 5 5 10 | |||
| // Lens -> 0 0 0 5 => 0 0 0 5 | |||
| // Lens -> 5 0 0 0 => 5 5 5 5 | |||
| // Lens -> 0 5 0 0 => 0 5 5 5 | |||
| // Lens -> 0 0 5 0 => 0 0 5 5 | |||
| func modifyLengthsToEndOffsets(lengths []uint64) []uint64 { | |||
| var runningOffset uint64 | |||
| var index, i int | |||
| for i = 1; i <= len(lengths); i++ { | |||
| runningOffset += lengths[i-1] | |||
| lengths[index] = runningOffset | |||
| index++ | |||
| } | |||
| return lengths | |||
| } | |||
| func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) { | |||
| var start uint64 | |||
| if chunk > 0 { | |||
| start = offsets[chunk-1] | |||
| } | |||
| return start, offsets[chunk] | |||
| } | |||
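The two helpers above are a matched pair: modifyLengthsToEndOffsets rewrites the lengths slice in place into cumulative end offsets, and readChunkBoundary recovers any chunk's (start, end) from them. A quick round trip using one of the comment's own examples:

```go
lens := []uint64{0, 5, 0, 5}
offsets := modifyLengthsToEndOffsets(lens) // [0 5 5 10], same backing array
start, end := readChunkBoundary(3, offsets)
fmt.Println(start, end) // prints: 5 10 -- chunk 3 holds bytes [5, 10)
```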
| @@ -24,11 +24,13 @@ import ( | |||
| "sort" | |||
| "github.com/RoaringBitmap/roaring" | |||
| "github.com/Smerity/govarint" | |||
| seg "github.com/blevesearch/bleve/index/scorch/segment" | |||
| "github.com/couchbase/vellum" | |||
| "github.com/golang/snappy" | |||
| ) | |||
| var DefaultFileMergerBufferSize = 1024 * 1024 | |||
| const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc | |||
| // Merge takes a slice of zap segments and bit masks describing which | |||
| @@ -36,12 +38,24 @@ const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc | |||
| // remaining data. This new segment is built at the specified path, | |||
| // with the provided chunkFactor. | |||
| func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, | |||
| chunkFactor uint32) ([][]uint64, error) { | |||
| chunkFactor uint32, closeCh chan struct{}, s seg.StatsReporter) ( | |||
| [][]uint64, uint64, error) { | |||
| segmentBases := make([]*SegmentBase, len(segments)) | |||
| for segmenti, segment := range segments { | |||
| segmentBases[segmenti] = &segment.SegmentBase | |||
| } | |||
| return MergeSegmentBases(segmentBases, drops, path, chunkFactor, closeCh, s) | |||
| } | |||
| func MergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, path string, | |||
| chunkFactor uint32, closeCh chan struct{}, s seg.StatsReporter) ( | |||
| [][]uint64, uint64, error) { | |||
| flag := os.O_RDWR | os.O_CREATE | |||
| f, err := os.OpenFile(path, flag, 0600) | |||
| if err != nil { | |||
| return nil, err | |||
| return nil, 0, err | |||
| } | |||
| cleanup := func() { | |||
| @@ -49,54 +63,49 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, | |||
| _ = os.Remove(path) | |||
| } | |||
| segmentBases := make([]*SegmentBase, len(segments)) | |||
| for segmenti, segment := range segments { | |||
| segmentBases[segmenti] = &segment.SegmentBase | |||
| } | |||
| // buffer the output | |||
| br := bufio.NewWriter(f) | |||
| br := bufio.NewWriterSize(f, DefaultFileMergerBufferSize) | |||
| // wrap it for counting (tracking offsets) | |||
| cr := NewCountHashWriter(br) | |||
| cr := NewCountHashWriterWithStatsReporter(br, s) | |||
| newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, _, _, err := | |||
| MergeToWriter(segmentBases, drops, chunkFactor, cr) | |||
| MergeToWriter(segmentBases, drops, chunkFactor, cr, closeCh) | |||
| if err != nil { | |||
| cleanup() | |||
| return nil, err | |||
| return nil, 0, err | |||
| } | |||
| err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, | |||
| docValueOffset, chunkFactor, cr.Sum32(), cr) | |||
| if err != nil { | |||
| cleanup() | |||
| return nil, err | |||
| return nil, 0, err | |||
| } | |||
| err = br.Flush() | |||
| if err != nil { | |||
| cleanup() | |||
| return nil, err | |||
| return nil, 0, err | |||
| } | |||
| err = f.Sync() | |||
| if err != nil { | |||
| cleanup() | |||
| return nil, err | |||
| return nil, 0, err | |||
| } | |||
| err = f.Close() | |||
| if err != nil { | |||
| cleanup() | |||
| return nil, err | |||
| return nil, 0, err | |||
| } | |||
| return newDocNums, nil | |||
| return newDocNums, uint64(cr.Count()), nil | |||
| } | |||
| func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, | |||
| chunkFactor uint32, cr *CountHashWriter) ( | |||
| chunkFactor uint32, cr *CountHashWriter, closeCh chan struct{}) ( | |||
| newDocNums [][]uint64, | |||
| numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, | |||
| dictLocs []uint64, fieldsInv []string, fieldsMap map[string]uint16, | |||
| @@ -108,15 +117,21 @@ func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, | |||
| fieldsMap = mapFields(fieldsInv) | |||
| numDocs = computeNewDocCount(segments, drops) | |||
| if isClosed(closeCh) { | |||
| return nil, 0, 0, 0, 0, nil, nil, nil, seg.ErrClosed | |||
| } | |||
| if numDocs > 0 { | |||
| storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, | |||
| fieldsMap, fieldsInv, fieldsSame, numDocs, cr) | |||
| fieldsMap, fieldsInv, fieldsSame, numDocs, cr, closeCh) | |||
| if err != nil { | |||
| return nil, 0, 0, 0, 0, nil, nil, nil, err | |||
| } | |||
| dictLocs, docValueOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap, | |||
| newDocNums, numDocs, chunkFactor, cr) | |||
| dictLocs, docValueOffset, err = persistMergedRest(segments, drops, | |||
| fieldsInv, fieldsMap, fieldsSame, | |||
| newDocNums, numDocs, chunkFactor, cr, closeCh) | |||
| if err != nil { | |||
| return nil, 0, 0, 0, 0, nil, nil, nil, err | |||
| } | |||
| @@ -156,11 +171,10 @@ func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64 | |||
| } | |||
| func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, | |||
| fieldsInv []string, fieldsMap map[string]uint16, newDocNumsIn [][]uint64, | |||
| newSegDocCount uint64, chunkFactor uint32, | |||
| w *CountHashWriter) ([]uint64, uint64, error) { | |||
| fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool, | |||
| newDocNumsIn [][]uint64, newSegDocCount uint64, chunkFactor uint32, | |||
| w *CountHashWriter, closeCh chan struct{}) ([]uint64, uint64, error) { | |||
| var bufReuse bytes.Buffer | |||
| var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) | |||
| var bufLoc []uint64 | |||
| @@ -168,28 +182,22 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, | |||
| var postItr *PostingsIterator | |||
| rv := make([]uint64, len(fieldsInv)) | |||
| fieldDvLocs := make([]uint64, len(fieldsInv)) | |||
| fieldDvLocsStart := make([]uint64, len(fieldsInv)) | |||
| fieldDvLocsEnd := make([]uint64, len(fieldsInv)) | |||
| tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) | |||
| locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) | |||
| // docTermMap is keyed by docNum, where the array impl provides | |||
| // better memory usage behavior than a sparsity-friendly hashmap | |||
| // when docs have much structural similarity (i.e., every doc | |||
| // has a given field) | |||
| var docTermMap [][]byte | |||
| var vellumBuf bytes.Buffer | |||
| newVellum, err := vellum.New(&vellumBuf, nil) | |||
| if err != nil { | |||
| return nil, 0, err | |||
| } | |||
| newRoaring := roaring.NewBitmap() | |||
| // for each field | |||
| for fieldID, fieldName := range fieldsInv { | |||
| if fieldID != 0 { | |||
| vellumBuf.Reset() | |||
| } | |||
| newVellum, err := vellum.New(&vellumBuf, nil) | |||
| if err != nil { | |||
| return nil, 0, err | |||
| } | |||
| // collect FST iterators from all active segments for this field | |||
| var newDocNums [][]uint64 | |||
| @@ -197,7 +205,15 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, | |||
| var dicts []*Dictionary | |||
| var itrs []vellum.Iterator | |||
| var segmentsInFocus []*SegmentBase | |||
| for segmentI, segment := range segments { | |||
| // check for the closure in meantime | |||
| if isClosed(closeCh) { | |||
| return nil, 0, seg.ErrClosed | |||
| } | |||
| dict, err2 := segment.dictionary(fieldName) | |||
| if err2 != nil { | |||
| return nil, 0, err2 | |||
| @@ -209,89 +225,63 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, | |||
| } | |||
| if itr != nil { | |||
| newDocNums = append(newDocNums, newDocNumsIn[segmentI]) | |||
| drops = append(drops, dropsIn[segmentI]) | |||
| if dropsIn[segmentI] != nil && !dropsIn[segmentI].IsEmpty() { | |||
| drops = append(drops, dropsIn[segmentI]) | |||
| } else { | |||
| drops = append(drops, nil) | |||
| } | |||
| dicts = append(dicts, dict) | |||
| itrs = append(itrs, itr) | |||
| segmentsInFocus = append(segmentsInFocus, segment) | |||
| } | |||
| } | |||
| } | |||
| if uint64(cap(docTermMap)) < newSegDocCount { | |||
| docTermMap = make([][]byte, newSegDocCount) | |||
| } else { | |||
| docTermMap = docTermMap[0:newSegDocCount] | |||
| for docNum := range docTermMap { // reset the docTermMap | |||
| docTermMap[docNum] = docTermMap[docNum][:0] | |||
| } | |||
| } | |||
| var prevTerm []byte | |||
| newRoaring := roaring.NewBitmap() | |||
| newRoaringLocs := roaring.NewBitmap() | |||
| newRoaring.Clear() | |||
| finishTerm := func(term []byte) error { | |||
| if term == nil { | |||
| return nil | |||
| var lastDocNum, lastFreq, lastNorm uint64 | |||
| // determines whether to use "1-hit" encoding optimization | |||
| // when a term appears in only 1 doc, with no loc info, | |||
| // has freq of 1, and the docNum fits into 31-bits | |||
| use1HitEncoding := func(termCardinality uint64) (bool, uint64, uint64) { | |||
| if termCardinality == uint64(1) && locEncoder.FinalSize() <= 0 { | |||
| docNum := uint64(newRoaring.Minimum()) | |||
| if under32Bits(docNum) && docNum == lastDocNum && lastFreq == 1 { | |||
| return true, docNum, lastNorm | |||
| } | |||
| } | |||
| return false, 0, 0 | |||
| } | |||
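// Note (an assumption about writePostings, which is defined elsewhere in
// this package): when use1HitEncoding reports true, the posting can be
// encoded directly into the FST value for the term -- no separate postings
// list, bitmap, or freq/norm chunks are written -- which is why the merge
// loop below tracks lastDocNum, lastFreq and lastNorm while copying hits.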
| finishTerm := func(term []byte) error { | |||
| tfEncoder.Close() | |||
| locEncoder.Close() | |||
| if newRoaring.GetCardinality() > 0 { | |||
| // this field/term actually has hits in the new segment, lets write it down | |||
| freqOffset := uint64(w.Count()) | |||
| _, err := tfEncoder.Write(w) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| locOffset := uint64(w.Count()) | |||
| _, err = locEncoder.Write(w) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| postingLocOffset := uint64(w.Count()) | |||
| _, err = writeRoaringWithLen(newRoaringLocs, w, &bufReuse, bufMaxVarintLen64) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| postingOffset := uint64(w.Count()) | |||
| // write out the start of the term info | |||
| n := binary.PutUvarint(bufMaxVarintLen64, freqOffset) | |||
| _, err = w.Write(bufMaxVarintLen64[:n]) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| // write out the start of the loc info | |||
| n = binary.PutUvarint(bufMaxVarintLen64, locOffset) | |||
| _, err = w.Write(bufMaxVarintLen64[:n]) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| // write out the start of the posting locs | |||
| n = binary.PutUvarint(bufMaxVarintLen64, postingLocOffset) | |||
| _, err = w.Write(bufMaxVarintLen64[:n]) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| _, err = writeRoaringWithLen(newRoaring, w, &bufReuse, bufMaxVarintLen64) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| postingsOffset, err := writePostings(newRoaring, | |||
| tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| err = newVellum.Insert(term, postingOffset) | |||
| if postingsOffset > 0 { | |||
| err = newVellum.Insert(term, postingsOffset) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| } | |||
| newRoaring = roaring.NewBitmap() | |||
| newRoaringLocs = roaring.NewBitmap() | |||
| newRoaring.Clear() | |||
| tfEncoder.Reset() | |||
| locEncoder.Reset() | |||
| lastDocNum = 0 | |||
| lastFreq = 0 | |||
| lastNorm = 0 | |||
| return nil | |||
| } | |||
| @@ -301,66 +291,39 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, | |||
| term, itrI, postingsOffset := enumerator.Current() | |||
| if !bytes.Equal(prevTerm, term) { | |||
| // check for the closure in meantime | |||
| if isClosed(closeCh) { | |||
| return nil, 0, seg.ErrClosed | |||
| } | |||
| // if the term changed, write out the info collected | |||
| // for the previous term | |||
| err2 := finishTerm(prevTerm) | |||
| if err2 != nil { | |||
| return nil, 0, err2 | |||
| err = finishTerm(prevTerm) | |||
| if err != nil { | |||
| return nil, 0, err | |||
| } | |||
| } | |||
| var err2 error | |||
| postings, err2 = dicts[itrI].postingsListFromOffset( | |||
| postings, err = dicts[itrI].postingsListFromOffset( | |||
| postingsOffset, drops[itrI], postings) | |||
| if err2 != nil { | |||
| return nil, 0, err2 | |||
| if err != nil { | |||
| return nil, 0, err | |||
| } | |||
| newDocNumsI := newDocNums[itrI] | |||
| postItr = postings.iterator(postItr) | |||
| next, err2 := postItr.Next() | |||
| for next != nil && err2 == nil { | |||
| hitNewDocNum := newDocNumsI[next.Number()] | |||
| if hitNewDocNum == docDropped { | |||
| return nil, 0, fmt.Errorf("see hit with dropped doc num") | |||
| } | |||
| newRoaring.Add(uint32(hitNewDocNum)) | |||
| // encode norm bits | |||
| norm := next.Norm() | |||
| normBits := math.Float32bits(float32(norm)) | |||
| err = tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits)) | |||
| if err != nil { | |||
| return nil, 0, err | |||
| } | |||
| locs := next.Locations() | |||
| if len(locs) > 0 { | |||
| newRoaringLocs.Add(uint32(hitNewDocNum)) | |||
| for _, loc := range locs { | |||
| if cap(bufLoc) < 5+len(loc.ArrayPositions()) { | |||
| bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions())) | |||
| } | |||
| args := bufLoc[0:5] | |||
| args[0] = uint64(fieldsMap[loc.Field()] - 1) | |||
| args[1] = loc.Pos() | |||
| args[2] = loc.Start() | |||
| args[3] = loc.End() | |||
| args[4] = uint64(len(loc.ArrayPositions())) | |||
| args = append(args, loc.ArrayPositions()...) | |||
| err = locEncoder.Add(hitNewDocNum, args...) | |||
| if err != nil { | |||
| return nil, 0, err | |||
| } | |||
| } | |||
| } | |||
| docTermMap[hitNewDocNum] = | |||
| append(append(docTermMap[hitNewDocNum], term...), termSeparator) | |||
| next, err2 = postItr.Next() | |||
| postItr = postings.iterator(true, true, true, postItr) | |||
| if fieldsSame { | |||
| // can optimize by copying freq/norm/loc bytes directly | |||
| lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying( | |||
| term, postItr, newDocNums[itrI], newRoaring, | |||
| tfEncoder, locEncoder) | |||
| } else { | |||
| lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs( | |||
| fieldsMap, term, postItr, newDocNums[itrI], newRoaring, | |||
| tfEncoder, locEncoder, bufLoc) | |||
| } | |||
| if err2 != nil { | |||
| return nil, 0, err2 | |||
| if err != nil { | |||
| return nil, 0, err | |||
| } | |||
| prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem | |||
| @@ -368,7 +331,7 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, | |||
| err = enumerator.Next() | |||
| } | |||
| if err != nil && err != vellum.ErrIteratorDone { | |||
| if err != vellum.ErrIteratorDone { | |||
| return nil, 0, err | |||
| } | |||
| @@ -400,26 +363,63 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, | |||
| rv[fieldID] = dictOffset | |||
| // get the field doc value offset (start) | |||
| fieldDvLocsStart[fieldID] = uint64(w.Count()) | |||
| // update the field doc values | |||
| fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1) | |||
| for docNum, docTerms := range docTermMap { | |||
| if len(docTerms) > 0 { | |||
| err = fdvEncoder.Add(uint64(docNum), docTerms) | |||
| fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1, w, true) | |||
| fdvReadersAvailable := false | |||
| var dvIterClone *docValueReader | |||
| for segmentI, segment := range segmentsInFocus { | |||
| // check for closure in the meantime | |||
| if isClosed(closeCh) { | |||
| return nil, 0, seg.ErrClosed | |||
| } | |||
| fieldIDPlus1 := uint16(segment.fieldsMap[fieldName]) | |||
| if dvIter, exists := segment.fieldDvReaders[fieldIDPlus1-1]; exists && | |||
| dvIter != nil { | |||
| fdvReadersAvailable = true | |||
| dvIterClone = dvIter.cloneInto(dvIterClone) | |||
| err = dvIterClone.iterateAllDocValues(segment, func(docNum uint64, terms []byte) error { | |||
| if newDocNums[segmentI][docNum] == docDropped { | |||
| return nil | |||
| } | |||
| err := fdvEncoder.Add(newDocNums[segmentI][docNum], terms) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| return nil | |||
| }) | |||
| if err != nil { | |||
| return nil, 0, err | |||
| } | |||
| } | |||
| } | |||
| err = fdvEncoder.Close() | |||
| if err != nil { | |||
| return nil, 0, err | |||
| } | |||
| // get the field doc value offset | |||
| fieldDvLocs[fieldID] = uint64(w.Count()) | |||
| if fdvReadersAvailable { | |||
| err = fdvEncoder.Close() | |||
| if err != nil { | |||
| return nil, 0, err | |||
| } | |||
| // persist the doc value details for this field | |||
| _, err = fdvEncoder.Write() | |||
| if err != nil { | |||
| return nil, 0, err | |||
| } | |||
| // get the field doc value offset (end) | |||
| fieldDvLocsEnd[fieldID] = uint64(w.Count()) | |||
| } else { | |||
| fieldDvLocsStart[fieldID] = fieldNotUninverted | |||
| fieldDvLocsEnd[fieldID] = fieldNotUninverted | |||
| } | |||
| // persist the doc value details for this field | |||
| _, err = fdvEncoder.Write(w) | |||
| // reset vellum buffer and vellum builder | |||
| vellumBuf.Reset() | |||
| err = newVellum.Reset(&vellumBuf) | |||
| if err != nil { | |||
| return nil, 0, err | |||
| } | |||
| @@ -428,38 +428,210 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, | |||
| fieldDvLocsOffset := uint64(w.Count()) | |||
| buf := bufMaxVarintLen64 | |||
| for _, offset := range fieldDvLocs { | |||
| n := binary.PutUvarint(buf, uint64(offset)) | |||
| for i := 0; i < len(fieldDvLocsStart); i++ { | |||
| n := binary.PutUvarint(buf, fieldDvLocsStart[i]) | |||
| _, err := w.Write(buf[:n]) | |||
| if err != nil { | |||
| return nil, 0, err | |||
| } | |||
| n = binary.PutUvarint(buf, fieldDvLocsEnd[i]) | |||
| _, err = w.Write(buf[:n]) | |||
| if err != nil { | |||
| return nil, 0, err | |||
| } | |||
| } | |||
| return rv, fieldDvLocsOffset, nil | |||
| } | |||
| func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator, | |||
| newDocNums []uint64, newRoaring *roaring.Bitmap, | |||
| tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, bufLoc []uint64) ( | |||
| lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) { | |||
| next, err := postItr.Next() | |||
| for next != nil && err == nil { | |||
| hitNewDocNum := newDocNums[next.Number()] | |||
| if hitNewDocNum == docDropped { | |||
| return 0, 0, 0, nil, fmt.Errorf("see hit with dropped docNum") | |||
| } | |||
| newRoaring.Add(uint32(hitNewDocNum)) | |||
| nextFreq := next.Frequency() | |||
| nextNorm := uint64(math.Float32bits(float32(next.Norm()))) | |||
| locs := next.Locations() | |||
| err = tfEncoder.Add(hitNewDocNum, | |||
| encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm) | |||
| if err != nil { | |||
| return 0, 0, 0, nil, err | |||
| } | |||
| if len(locs) > 0 { | |||
| numBytesLocs := 0 | |||
| for _, loc := range locs { | |||
| ap := loc.ArrayPositions() | |||
| numBytesLocs += totalUvarintBytes(uint64(fieldsMap[loc.Field()]-1), | |||
| loc.Pos(), loc.Start(), loc.End(), uint64(len(ap)), ap) | |||
| } | |||
| err = locEncoder.Add(hitNewDocNum, uint64(numBytesLocs)) | |||
| if err != nil { | |||
| return 0, 0, 0, nil, err | |||
| } | |||
| for _, loc := range locs { | |||
| ap := loc.ArrayPositions() | |||
| if cap(bufLoc) < 5+len(ap) { | |||
| bufLoc = make([]uint64, 0, 5+len(ap)) | |||
| } | |||
| args := bufLoc[0:5] | |||
| args[0] = uint64(fieldsMap[loc.Field()] - 1) | |||
| args[1] = loc.Pos() | |||
| args[2] = loc.Start() | |||
| args[3] = loc.End() | |||
| args[4] = uint64(len(ap)) | |||
| args = append(args, ap...) | |||
| err = locEncoder.Add(hitNewDocNum, args...) | |||
| if err != nil { | |||
| return 0, 0, 0, nil, err | |||
| } | |||
| } | |||
| } | |||
| lastDocNum = hitNewDocNum | |||
| lastFreq = nextFreq | |||
| lastNorm = nextNorm | |||
| next, err = postItr.Next() | |||
| } | |||
| return lastDocNum, lastFreq, lastNorm, bufLoc, err | |||
| } | |||
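| // Note on the wire format produced above: for every hit, the freq/norm | |||
| // stream receives encodeFreqHasLocs(freq, hasLocs) followed by the norm | |||
| // as math.Float32bits, and when locations exist the loc stream is | |||
| // prefixed with the total uvarint byte size of all location tuples | |||
| // (computed via totalUvarintBytes) before the per-location | |||
| // field/pos/start/end/arrayPositions values are appended. | |||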
| func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, | |||
| newDocNums []uint64, newRoaring *roaring.Bitmap, | |||
| tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder) ( | |||
| lastDocNum uint64, lastFreq uint64, lastNorm uint64, err error) { | |||
| nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err := | |||
| postItr.nextBytes() | |||
| for err == nil && len(nextFreqNormBytes) > 0 { | |||
| hitNewDocNum := newDocNums[nextDocNum] | |||
| if hitNewDocNum == docDropped { | |||
| return 0, 0, 0, fmt.Errorf("see hit with dropped doc num") | |||
| } | |||
| newRoaring.Add(uint32(hitNewDocNum)) | |||
| err = tfEncoder.AddBytes(hitNewDocNum, nextFreqNormBytes) | |||
| if err != nil { | |||
| return 0, 0, 0, err | |||
| } | |||
| if len(nextLocBytes) > 0 { | |||
| err = locEncoder.AddBytes(hitNewDocNum, nextLocBytes) | |||
| if err != nil { | |||
| return 0, 0, 0, err | |||
| } | |||
| } | |||
| lastDocNum = hitNewDocNum | |||
| lastFreq = nextFreq | |||
| lastNorm = nextNorm | |||
| nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err = | |||
| postItr.nextBytes() | |||
| } | |||
| return lastDocNum, lastFreq, lastNorm, err | |||
| } | |||
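| // mergeTermFreqNormLocsByCopying is the fieldsSame fast path used by | |||
| // persistMergedRest: when all input segments share an identical field | |||
| // mapping, the already-encoded freq/norm and loc chunk bytes returned by | |||
| // nextBytes() can be appended verbatim via AddBytes, avoiding the | |||
| // decode/re-encode round trip performed by mergeTermFreqNormLocs above. | |||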
| func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder, | |||
| use1HitEncoding func(uint64) (bool, uint64, uint64), | |||
| w *CountHashWriter, bufMaxVarintLen64 []byte) ( | |||
| offset uint64, err error) { | |||
| termCardinality := postings.GetCardinality() | |||
| if termCardinality <= 0 { | |||
| return 0, nil | |||
| } | |||
| if use1HitEncoding != nil { | |||
| encodeAs1Hit, docNum1Hit, normBits1Hit := use1HitEncoding(termCardinality) | |||
| if encodeAs1Hit { | |||
| return FSTValEncode1Hit(docNum1Hit, normBits1Hit), nil | |||
| } | |||
| } | |||
| tfOffset := uint64(w.Count()) | |||
| _, err = tfEncoder.Write(w) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| locOffset := uint64(w.Count()) | |||
| _, err = locEncoder.Write(w) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| postingsOffset := uint64(w.Count()) | |||
| n := binary.PutUvarint(bufMaxVarintLen64, tfOffset) | |||
| _, err = w.Write(bufMaxVarintLen64[:n]) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| n = binary.PutUvarint(bufMaxVarintLen64, locOffset) | |||
| _, err = w.Write(bufMaxVarintLen64[:n]) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| _, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| return postingsOffset, nil | |||
| } | |||
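| // Rough per-term layout written by writePostings, as implied by the | |||
| // writes above, with postingsOffset as the returned address: | |||
| // | |||
| //   freq/norm chunked ints        <- tfOffset | |||
| //   location chunked ints         <- locOffset | |||
| //   uvarint(tfOffset)             <- postingsOffset | |||
| //   uvarint(locOffset) | |||
| //   uvarint(len) + roaring bitmap | |||
| // | |||
| // A term that qualifies for 1-hit encoding writes nothing here and | |||
| // returns the FSTValEncode1Hit value instead. | |||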
| type varintEncoder func(uint64) (int, error) | |||
| func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, | |||
| fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64, | |||
| w *CountHashWriter) (uint64, [][]uint64, error) { | |||
| w *CountHashWriter, closeCh chan struct{}) (uint64, [][]uint64, error) { | |||
| var rv [][]uint64 // The remapped or newDocNums for each segment. | |||
| var newDocNum uint64 | |||
| var curr int | |||
| var metaBuf bytes.Buffer | |||
| var data, compressed []byte | |||
| metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) | |||
| var metaBuf bytes.Buffer | |||
| varBuf := make([]byte, binary.MaxVarintLen64) | |||
| metaEncode := func(val uint64) (int, error) { | |||
| wb := binary.PutUvarint(varBuf, val) | |||
| return metaBuf.Write(varBuf[:wb]) | |||
| } | |||
| vals := make([][][]byte, len(fieldsInv)) | |||
| typs := make([][]byte, len(fieldsInv)) | |||
| poss := make([][][]uint64, len(fieldsInv)) | |||
| var posBuf []uint64 | |||
| docNumOffsets := make([]uint64, newSegDocCount) | |||
| vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) | |||
| defer visitDocumentCtxPool.Put(vdc) | |||
| // for each segment | |||
| for segI, segment := range segments { | |||
| // check for closure in the meantime | |||
| if isClosed(closeCh) { | |||
| return 0, nil, seg.ErrClosed | |||
| } | |||
| segNewDocNums := make([]uint64, segment.numDocs) | |||
| dropsI := drops[segI] | |||
| @@ -495,7 +667,8 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, | |||
| curr = 0 | |||
| metaBuf.Reset() | |||
| data = data[:0] | |||
| compressed = compressed[:0] | |||
| posTemp := posBuf | |||
| // collect all the data | |||
| for i := 0; i < len(fieldsInv); i++ { | |||
| @@ -503,42 +676,63 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, | |||
| typs[i] = typs[i][:0] | |||
| poss[i] = poss[i][:0] | |||
| } | |||
| err := segment.VisitDocument(docNum, func(field string, typ byte, value []byte, pos []uint64) bool { | |||
| err := segment.visitDocument(vdc, docNum, func(field string, typ byte, value []byte, pos []uint64) bool { | |||
| fieldID := int(fieldsMap[field]) - 1 | |||
| vals[fieldID] = append(vals[fieldID], value) | |||
| typs[fieldID] = append(typs[fieldID], typ) | |||
| poss[fieldID] = append(poss[fieldID], pos) | |||
| // copy array positions to preserve them beyond the scope of this callback | |||
| var curPos []uint64 | |||
| if len(pos) > 0 { | |||
| if cap(posTemp) < len(pos) { | |||
| posBuf = make([]uint64, len(pos)*len(fieldsInv)) | |||
| posTemp = posBuf | |||
| } | |||
| curPos = posTemp[0:len(pos)] | |||
| copy(curPos, pos) | |||
| posTemp = posTemp[len(pos):] | |||
| } | |||
| poss[fieldID] = append(poss[fieldID], curPos) | |||
| return true | |||
| }) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| // now walk the fields in order | |||
| for fieldID := range fieldsInv { | |||
| storedFieldValues := vals[int(fieldID)] | |||
| // _id field special case optimizes ExternalID() lookups | |||
| idFieldVal := vals[uint16(0)][0] | |||
| _, err = metaEncode(uint64(len(idFieldVal))) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| // now walk the non-"_id" fields in order | |||
| for fieldID := 1; fieldID < len(fieldsInv); fieldID++ { | |||
| storedFieldValues := vals[fieldID] | |||
| stf := typs[int(fieldID)] | |||
| spf := poss[int(fieldID)] | |||
| stf := typs[fieldID] | |||
| spf := poss[fieldID] | |||
| var err2 error | |||
| curr, data, err2 = persistStoredFieldValues(fieldID, | |||
| storedFieldValues, stf, spf, curr, metaEncoder, data) | |||
| storedFieldValues, stf, spf, curr, metaEncode, data) | |||
| if err2 != nil { | |||
| return 0, nil, err2 | |||
| } | |||
| } | |||
| metaEncoder.Close() | |||
| metaBytes := metaBuf.Bytes() | |||
| compressed = snappy.Encode(compressed, data) | |||
| compressed = snappy.Encode(compressed[:cap(compressed)], data) | |||
| // record where we're about to start writing | |||
| docNumOffsets[newDocNum] = uint64(w.Count()) | |||
| // write out the meta len and compressed data len | |||
| _, err = writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed))) | |||
| _, err = writeUvarints(w, | |||
| uint64(len(metaBytes)), | |||
| uint64(len(idFieldVal)+len(compressed))) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| @@ -547,6 +741,11 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| // now write the _id field val (counted as part of the 'compressed' data) | |||
| _, err = w.Write(idFieldVal) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| // now write the compressed data | |||
| _, err = w.Write(compressed) | |||
| if err != nil { | |||
| @@ -644,3 +843,12 @@ func mergeFields(segments []*SegmentBase) (bool, []string) { | |||
| return fieldsSame, rv | |||
| } | |||
| func isClosed(closeCh chan struct{}) bool { | |||
| select { | |||
| case <-closeCh: | |||
| return true | |||
| default: | |||
| return false | |||
| } | |||
| } | |||
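| // isClosed is a non-blocking poll: the select falls through to the | |||
| // default case while closeCh is still open, so the merge loops above can | |||
| // cheaply check between terms and segments and abort with seg.ErrClosed | |||
| // once the channel is closed. | |||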
| @@ -0,0 +1,826 @@ | |||
| // Copyright (c) 2018 Couchbase, Inc. | |||
| // | |||
| // Licensed under the Apache License, Version 2.0 (the "License"); | |||
| // you may not use this file except in compliance with the License. | |||
| // You may obtain a copy of the License at | |||
| // | |||
| // http://www.apache.org/licenses/LICENSE-2.0 | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software | |||
| // distributed under the License is distributed on an "AS IS" BASIS, | |||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| // See the License for the specific language governing permissions and | |||
| // limitations under the License. | |||
| package zap | |||
| import ( | |||
| "bytes" | |||
| "encoding/binary" | |||
| "math" | |||
| "sort" | |||
| "sync" | |||
| "github.com/RoaringBitmap/roaring" | |||
| "github.com/blevesearch/bleve/analysis" | |||
| "github.com/blevesearch/bleve/document" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/couchbase/vellum" | |||
| "github.com/golang/snappy" | |||
| ) | |||
| var NewSegmentBufferNumResultsBump int = 100 | |||
| var NewSegmentBufferNumResultsFactor float64 = 1.0 | |||
| var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0 | |||
| // AnalysisResultsToSegmentBase produces an in-memory zap-encoded | |||
| // SegmentBase from analysis results | |||
| func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, | |||
| chunkFactor uint32) (*SegmentBase, uint64, error) { | |||
| s := interimPool.Get().(*interim) | |||
| var br bytes.Buffer | |||
| if s.lastNumDocs > 0 { | |||
| // use previous results to initialize the buf with an estimated | |||
| // size, but note that the interim instance comes from a | |||
| // global interimPool, so multiple scorch instances indexing | |||
| // different docs can lead to low-quality estimates | |||
| estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) * | |||
| NewSegmentBufferAvgBytesPerDocFactor) | |||
| estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) * | |||
| NewSegmentBufferNumResultsFactor) | |||
| br.Grow(estimateAvgBytesPerDoc * estimateNumResults) | |||
| } | |||
| s.results = results | |||
| s.chunkFactor = chunkFactor | |||
| s.w = NewCountHashWriter(&br) | |||
| storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, | |||
| err := s.convert() | |||
| if err != nil { | |||
| return nil, uint64(0), err | |||
| } | |||
| sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkFactor, | |||
| s.FieldsMap, s.FieldsInv, uint64(len(results)), | |||
| storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets) | |||
| if err == nil && s.reset() == nil { | |||
| s.lastNumDocs = len(results) | |||
| s.lastOutSize = len(br.Bytes()) | |||
| interimPool.Put(s) | |||
| } | |||
| return sb, uint64(len(br.Bytes())), err | |||
| } | |||
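| // Illustrative sizing example (assumed numbers): if the previous | |||
| // conversion produced lastOutSize=1MB from lastNumDocs=1000, then with | |||
| // the default 1.0 factors the buffer is pre-grown to roughly | |||
| // (1MB/1000) * (len(results)+100) bytes, trading a small up-front | |||
| // allocation for fewer bytes.Buffer growths inside convert(). | |||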
| var interimPool = sync.Pool{New: func() interface{} { return &interim{} }} | |||
| // interim holds temporary working data used while converting from | |||
| // analysis results to a zap-encoded segment | |||
| type interim struct { | |||
| results []*index.AnalysisResult | |||
| chunkFactor uint32 | |||
| w *CountHashWriter | |||
| // FieldsMap adds 1 to field id to avoid zero value issues | |||
| // name -> field id + 1 | |||
| FieldsMap map[string]uint16 | |||
| // FieldsInv is the inverse of FieldsMap | |||
| // field id -> name | |||
| FieldsInv []string | |||
| // Term dictionaries for each field | |||
| // field id -> term -> postings list id + 1 | |||
| Dicts []map[string]uint64 | |||
| // Terms for each field, where terms are sorted ascending | |||
| // field id -> []term | |||
| DictKeys [][]string | |||
| // Fields whose IncludeDocValues is true | |||
| // field id -> bool | |||
| IncludeDocValues []bool | |||
| // postings id -> bitmap of docNums | |||
| Postings []*roaring.Bitmap | |||
| // postings id -> freq/norm's, one for each docNum in postings | |||
| FreqNorms [][]interimFreqNorm | |||
| freqNormsBacking []interimFreqNorm | |||
| // postings id -> locs, one for each freq | |||
| Locs [][]interimLoc | |||
| locsBacking []interimLoc | |||
| numTermsPerPostingsList []int // key is postings list id | |||
| numLocsPerPostingsList []int // key is postings list id | |||
| builder *vellum.Builder | |||
| builderBuf bytes.Buffer | |||
| metaBuf bytes.Buffer | |||
| tmp0 []byte | |||
| tmp1 []byte | |||
| lastNumDocs int | |||
| lastOutSize int | |||
| } | |||
| func (s *interim) reset() (err error) { | |||
| s.results = nil | |||
| s.chunkFactor = 0 | |||
| s.w = nil | |||
| s.FieldsMap = nil | |||
| s.FieldsInv = nil | |||
| for i := range s.Dicts { | |||
| s.Dicts[i] = nil | |||
| } | |||
| s.Dicts = s.Dicts[:0] | |||
| for i := range s.DictKeys { | |||
| s.DictKeys[i] = s.DictKeys[i][:0] | |||
| } | |||
| s.DictKeys = s.DictKeys[:0] | |||
| for i := range s.IncludeDocValues { | |||
| s.IncludeDocValues[i] = false | |||
| } | |||
| s.IncludeDocValues = s.IncludeDocValues[:0] | |||
| for _, idn := range s.Postings { | |||
| idn.Clear() | |||
| } | |||
| s.Postings = s.Postings[:0] | |||
| s.FreqNorms = s.FreqNorms[:0] | |||
| for i := range s.freqNormsBacking { | |||
| s.freqNormsBacking[i] = interimFreqNorm{} | |||
| } | |||
| s.freqNormsBacking = s.freqNormsBacking[:0] | |||
| s.Locs = s.Locs[:0] | |||
| for i := range s.locsBacking { | |||
| s.locsBacking[i] = interimLoc{} | |||
| } | |||
| s.locsBacking = s.locsBacking[:0] | |||
| s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0] | |||
| s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0] | |||
| s.builderBuf.Reset() | |||
| if s.builder != nil { | |||
| err = s.builder.Reset(&s.builderBuf) | |||
| } | |||
| s.metaBuf.Reset() | |||
| s.tmp0 = s.tmp0[:0] | |||
| s.tmp1 = s.tmp1[:0] | |||
| s.lastNumDocs = 0 | |||
| s.lastOutSize = 0 | |||
| return err | |||
| } | |||
| func (s *interim) grabBuf(size int) []byte { | |||
| buf := s.tmp0 | |||
| if cap(buf) < size { | |||
| buf = make([]byte, size) | |||
| s.tmp0 = buf | |||
| } | |||
| return buf[0:size] | |||
| } | |||
| type interimStoredField struct { | |||
| vals [][]byte | |||
| typs []byte | |||
| arrayposs [][]uint64 // array positions | |||
| } | |||
| type interimFreqNorm struct { | |||
| freq uint64 | |||
| norm float32 | |||
| numLocs int | |||
| } | |||
| type interimLoc struct { | |||
| fieldID uint16 | |||
| pos uint64 | |||
| start uint64 | |||
| end uint64 | |||
| arrayposs []uint64 | |||
| } | |||
| func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) { | |||
| s.FieldsMap = map[string]uint16{} | |||
| s.getOrDefineField("_id") // _id field is fieldID 0 | |||
| for _, result := range s.results { | |||
| for _, field := range result.Document.CompositeFields { | |||
| s.getOrDefineField(field.Name()) | |||
| } | |||
| for _, field := range result.Document.Fields { | |||
| s.getOrDefineField(field.Name()) | |||
| } | |||
| } | |||
| sort.Strings(s.FieldsInv[1:]) // keep _id as first field | |||
| for fieldID, fieldName := range s.FieldsInv { | |||
| s.FieldsMap[fieldName] = uint16(fieldID + 1) | |||
| } | |||
| if cap(s.IncludeDocValues) >= len(s.FieldsInv) { | |||
| s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)] | |||
| } else { | |||
| s.IncludeDocValues = make([]bool, len(s.FieldsInv)) | |||
| } | |||
| s.prepareDicts() | |||
| for _, dict := range s.DictKeys { | |||
| sort.Strings(dict) | |||
| } | |||
| s.processDocuments() | |||
| storedIndexOffset, err := s.writeStoredFields() | |||
| if err != nil { | |||
| return 0, 0, 0, nil, err | |||
| } | |||
| var fdvIndexOffset uint64 | |||
| var dictOffsets []uint64 | |||
| if len(s.results) > 0 { | |||
| fdvIndexOffset, dictOffsets, err = s.writeDicts() | |||
| if err != nil { | |||
| return 0, 0, 0, nil, err | |||
| } | |||
| } else { | |||
| dictOffsets = make([]uint64, len(s.FieldsInv)) | |||
| } | |||
| fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets) | |||
| if err != nil { | |||
| return 0, 0, 0, nil, err | |||
| } | |||
| return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil | |||
| } | |||
| func (s *interim) getOrDefineField(fieldName string) int { | |||
| fieldIDPlus1, exists := s.FieldsMap[fieldName] | |||
| if !exists { | |||
| fieldIDPlus1 = uint16(len(s.FieldsInv) + 1) | |||
| s.FieldsMap[fieldName] = fieldIDPlus1 | |||
| s.FieldsInv = append(s.FieldsInv, fieldName) | |||
| s.Dicts = append(s.Dicts, make(map[string]uint64)) | |||
| n := len(s.DictKeys) | |||
| if n < cap(s.DictKeys) { | |||
| s.DictKeys = s.DictKeys[:n+1] | |||
| s.DictKeys[n] = s.DictKeys[n][:0] | |||
| } else { | |||
| s.DictKeys = append(s.DictKeys, []string(nil)) | |||
| } | |||
| } | |||
| return int(fieldIDPlus1 - 1) | |||
| } | |||
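| // Example of the +1 convention (field name "title" is illustrative): | |||
| // after getOrDefineField("_id") and getOrDefineField("title"), FieldsMap | |||
| // is {"_id": 1, "title": 2} while FieldsInv is ["_id", "title"], so a | |||
| // map miss returning the zero value can never be confused with field 0. | |||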
| // fill Dicts and DictKeys from analysis results | |||
| func (s *interim) prepareDicts() { | |||
| var pidNext int | |||
| var totTFs int | |||
| var totLocs int | |||
| visitField := func(fieldID uint16, tfs analysis.TokenFrequencies) { | |||
| dict := s.Dicts[fieldID] | |||
| dictKeys := s.DictKeys[fieldID] | |||
| for term, tf := range tfs { | |||
| pidPlus1, exists := dict[term] | |||
| if !exists { | |||
| pidNext++ | |||
| pidPlus1 = uint64(pidNext) | |||
| dict[term] = pidPlus1 | |||
| dictKeys = append(dictKeys, term) | |||
| s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0) | |||
| s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0) | |||
| } | |||
| pid := pidPlus1 - 1 | |||
| s.numTermsPerPostingsList[pid] += 1 | |||
| s.numLocsPerPostingsList[pid] += len(tf.Locations) | |||
| totLocs += len(tf.Locations) | |||
| } | |||
| totTFs += len(tfs) | |||
| s.DictKeys[fieldID] = dictKeys | |||
| } | |||
| for _, result := range s.results { | |||
| // walk each composite field | |||
| for _, field := range result.Document.CompositeFields { | |||
| fieldID := uint16(s.getOrDefineField(field.Name())) | |||
| _, tf := field.Analyze() | |||
| visitField(fieldID, tf) | |||
| } | |||
| // walk each field | |||
| for i, field := range result.Document.Fields { | |||
| fieldID := uint16(s.getOrDefineField(field.Name())) | |||
| tf := result.Analyzed[i] | |||
| visitField(fieldID, tf) | |||
| } | |||
| } | |||
| numPostingsLists := pidNext | |||
| if cap(s.Postings) >= numPostingsLists { | |||
| s.Postings = s.Postings[:numPostingsLists] | |||
| } else { | |||
| postings := make([]*roaring.Bitmap, numPostingsLists) | |||
| copy(postings, s.Postings[:cap(s.Postings)]) | |||
| for i := 0; i < numPostingsLists; i++ { | |||
| if postings[i] == nil { | |||
| postings[i] = roaring.New() | |||
| } | |||
| } | |||
| s.Postings = postings | |||
| } | |||
| if cap(s.FreqNorms) >= numPostingsLists { | |||
| s.FreqNorms = s.FreqNorms[:numPostingsLists] | |||
| } else { | |||
| s.FreqNorms = make([][]interimFreqNorm, numPostingsLists) | |||
| } | |||
| if cap(s.freqNormsBacking) >= totTFs { | |||
| s.freqNormsBacking = s.freqNormsBacking[:totTFs] | |||
| } else { | |||
| s.freqNormsBacking = make([]interimFreqNorm, totTFs) | |||
| } | |||
| freqNormsBacking := s.freqNormsBacking | |||
| for pid, numTerms := range s.numTermsPerPostingsList { | |||
| s.FreqNorms[pid] = freqNormsBacking[0:0] | |||
| freqNormsBacking = freqNormsBacking[numTerms:] | |||
| } | |||
| if cap(s.Locs) >= numPostingsLists { | |||
| s.Locs = s.Locs[:numPostingsLists] | |||
| } else { | |||
| s.Locs = make([][]interimLoc, numPostingsLists) | |||
| } | |||
| if cap(s.locsBacking) >= totLocs { | |||
| s.locsBacking = s.locsBacking[:totLocs] | |||
| } else { | |||
| s.locsBacking = make([]interimLoc, totLocs) | |||
| } | |||
| locsBacking := s.locsBacking | |||
| for pid, numLocs := range s.numLocsPerPostingsList { | |||
| s.Locs[pid] = locsBacking[0:0] | |||
| locsBacking = locsBacking[numLocs:] | |||
| } | |||
| } | |||
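| // Allocation note: FreqNorms[pid] and Locs[pid] are carved out of the | |||
| // flat freqNormsBacking and locsBacking arrays using the per-postings | |||
| // counts gathered above, so filling them later appends into | |||
| // preallocated memory -- one backing allocation per pass instead of one | |||
| // per postings list. | |||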
| func (s *interim) processDocuments() { | |||
| numFields := len(s.FieldsInv) | |||
| reuseFieldLens := make([]int, numFields) | |||
| reuseFieldTFs := make([]analysis.TokenFrequencies, numFields) | |||
| for docNum, result := range s.results { | |||
| for i := 0; i < numFields; i++ { // clear these for reuse | |||
| reuseFieldLens[i] = 0 | |||
| reuseFieldTFs[i] = nil | |||
| } | |||
| s.processDocument(uint64(docNum), result, | |||
| reuseFieldLens, reuseFieldTFs) | |||
| } | |||
| } | |||
| func (s *interim) processDocument(docNum uint64, | |||
| result *index.AnalysisResult, | |||
| fieldLens []int, fieldTFs []analysis.TokenFrequencies) { | |||
| visitField := func(fieldID uint16, fieldName string, | |||
| ln int, tf analysis.TokenFrequencies) { | |||
| fieldLens[fieldID] += ln | |||
| existingFreqs := fieldTFs[fieldID] | |||
| if existingFreqs != nil { | |||
| existingFreqs.MergeAll(fieldName, tf) | |||
| } else { | |||
| fieldTFs[fieldID] = tf | |||
| } | |||
| } | |||
| // walk each composite field | |||
| for _, field := range result.Document.CompositeFields { | |||
| fieldID := uint16(s.getOrDefineField(field.Name())) | |||
| ln, tf := field.Analyze() | |||
| visitField(fieldID, field.Name(), ln, tf) | |||
| } | |||
| // walk each field | |||
| for i, field := range result.Document.Fields { | |||
| fieldID := uint16(s.getOrDefineField(field.Name())) | |||
| ln := result.Length[i] | |||
| tf := result.Analyzed[i] | |||
| visitField(fieldID, field.Name(), ln, tf) | |||
| } | |||
| // now that it's been rolled up into fieldTFs, walk that | |||
| for fieldID, tfs := range fieldTFs { | |||
| dict := s.Dicts[fieldID] | |||
| norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID]))) | |||
| for term, tf := range tfs { | |||
| pid := dict[term] - 1 | |||
| bs := s.Postings[pid] | |||
| bs.Add(uint32(docNum)) | |||
| s.FreqNorms[pid] = append(s.FreqNorms[pid], | |||
| interimFreqNorm{ | |||
| freq: uint64(tf.Frequency()), | |||
| norm: norm, | |||
| numLocs: len(tf.Locations), | |||
| }) | |||
| if len(tf.Locations) > 0 { | |||
| locs := s.Locs[pid] | |||
| for _, loc := range tf.Locations { | |||
| var locf = uint16(fieldID) | |||
| if loc.Field != "" { | |||
| locf = uint16(s.getOrDefineField(loc.Field)) | |||
| } | |||
| var arrayposs []uint64 | |||
| if len(loc.ArrayPositions) > 0 { | |||
| arrayposs = loc.ArrayPositions | |||
| } | |||
| locs = append(locs, interimLoc{ | |||
| fieldID: locf, | |||
| pos: uint64(loc.Position), | |||
| start: uint64(loc.Start), | |||
| end: uint64(loc.End), | |||
| arrayposs: arrayposs, | |||
| }) | |||
| } | |||
| s.Locs[pid] = locs | |||
| } | |||
| } | |||
| } | |||
| } | |||
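| // The norm above is the usual 1/sqrt(fieldLength) length normalization; | |||
| // e.g. a field with 100 tokens stores norm = 1/sqrt(100) = 0.1, which is | |||
| // later packed into the freq/norm stream via math.Float32bits in | |||
| // writeDicts. | |||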
| func (s *interim) writeStoredFields() ( | |||
| storedIndexOffset uint64, err error) { | |||
| varBuf := make([]byte, binary.MaxVarintLen64) | |||
| metaEncode := func(val uint64) (int, error) { | |||
| wb := binary.PutUvarint(varBuf, val) | |||
| return s.metaBuf.Write(varBuf[:wb]) | |||
| } | |||
| data, compressed := s.tmp0[:0], s.tmp1[:0] | |||
| defer func() { s.tmp0, s.tmp1 = data, compressed }() | |||
| // keyed by docNum | |||
| docStoredOffsets := make([]uint64, len(s.results)) | |||
| // keyed by fieldID, for the current doc in the loop | |||
| docStoredFields := map[uint16]interimStoredField{} | |||
| for docNum, result := range s.results { | |||
| for fieldID := range docStoredFields { // reset for next doc | |||
| delete(docStoredFields, fieldID) | |||
| } | |||
| for _, field := range result.Document.Fields { | |||
| fieldID := uint16(s.getOrDefineField(field.Name())) | |||
| opts := field.Options() | |||
| if opts.IsStored() { | |||
| isf := docStoredFields[fieldID] | |||
| isf.vals = append(isf.vals, field.Value()) | |||
| isf.typs = append(isf.typs, encodeFieldType(field)) | |||
| isf.arrayposs = append(isf.arrayposs, field.ArrayPositions()) | |||
| docStoredFields[fieldID] = isf | |||
| } | |||
| if opts.IncludeDocValues() { | |||
| s.IncludeDocValues[fieldID] = true | |||
| } | |||
| } | |||
| var curr int | |||
| s.metaBuf.Reset() | |||
| data = data[:0] | |||
| // _id field special case optimizes ExternalID() lookups | |||
| idFieldVal := docStoredFields[uint16(0)].vals[0] | |||
| _, err = metaEncode(uint64(len(idFieldVal))) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| // handle non-"_id" fields | |||
| for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ { | |||
| isf, exists := docStoredFields[uint16(fieldID)] | |||
| if exists { | |||
| curr, data, err = persistStoredFieldValues( | |||
| fieldID, isf.vals, isf.typs, isf.arrayposs, | |||
| curr, metaEncode, data) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| } | |||
| } | |||
| metaBytes := s.metaBuf.Bytes() | |||
| compressed = snappy.Encode(compressed[:cap(compressed)], data) | |||
| docStoredOffsets[docNum] = uint64(s.w.Count()) | |||
| _, err := writeUvarints(s.w, | |||
| uint64(len(metaBytes)), | |||
| uint64(len(idFieldVal)+len(compressed))) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| _, err = s.w.Write(metaBytes) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| _, err = s.w.Write(idFieldVal) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| _, err = s.w.Write(compressed) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| } | |||
| storedIndexOffset = uint64(s.w.Count()) | |||
| for _, docStoredOffset := range docStoredOffsets { | |||
| err = binary.Write(s.w, binary.BigEndian, docStoredOffset) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| } | |||
| return storedIndexOffset, nil | |||
| } | |||
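| // Per-document stored record as written above (and decoded by | |||
| // visitDocument in segment.go): | |||
| // | |||
| //   uvarint(len(meta)) uvarint(len(_id) + len(snappy(data))) | |||
| //   meta bytes (first entry is uvarint(len(_id))) | |||
| //   raw _id bytes, uncompressed | |||
| //   snappy-compressed remaining field data | |||
| // | |||
| // followed, at storedIndexOffset, by one big-endian uint64 offset per | |||
| // document. | |||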
| func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) { | |||
| dictOffsets = make([]uint64, len(s.FieldsInv)) | |||
| fdvOffsetsStart := make([]uint64, len(s.FieldsInv)) | |||
| fdvOffsetsEnd := make([]uint64, len(s.FieldsInv)) | |||
| buf := s.grabBuf(binary.MaxVarintLen64) | |||
| tfEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1)) | |||
| locEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1)) | |||
| fdvEncoder := newChunkedContentCoder(uint64(s.chunkFactor), uint64(len(s.results)-1), s.w, false) | |||
| var docTermMap [][]byte | |||
| if s.builder == nil { | |||
| s.builder, err = vellum.New(&s.builderBuf, nil) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| } | |||
| for fieldID, terms := range s.DictKeys { | |||
| if cap(docTermMap) < len(s.results) { | |||
| docTermMap = make([][]byte, len(s.results)) | |||
| } else { | |||
| docTermMap = docTermMap[0:len(s.results)] | |||
| for docNum := range docTermMap { // reset the docTermMap | |||
| docTermMap[docNum] = docTermMap[docNum][:0] | |||
| } | |||
| } | |||
| dict := s.Dicts[fieldID] | |||
| for _, term := range terms { // terms are already sorted | |||
| pid := dict[term] - 1 | |||
| postingsBS := s.Postings[pid] | |||
| freqNorms := s.FreqNorms[pid] | |||
| freqNormOffset := 0 | |||
| locs := s.Locs[pid] | |||
| locOffset := 0 | |||
| postingsItr := postingsBS.Iterator() | |||
| for postingsItr.HasNext() { | |||
| docNum := uint64(postingsItr.Next()) | |||
| freqNorm := freqNorms[freqNormOffset] | |||
| err = tfEncoder.Add(docNum, | |||
| encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0), | |||
| uint64(math.Float32bits(freqNorm.norm))) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| if freqNorm.numLocs > 0 { | |||
| numBytesLocs := 0 | |||
| for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { | |||
| numBytesLocs += totalUvarintBytes( | |||
| uint64(loc.fieldID), loc.pos, loc.start, loc.end, | |||
| uint64(len(loc.arrayposs)), loc.arrayposs) | |||
| } | |||
| err = locEncoder.Add(docNum, uint64(numBytesLocs)) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { | |||
| err = locEncoder.Add(docNum, | |||
| uint64(loc.fieldID), loc.pos, loc.start, loc.end, | |||
| uint64(len(loc.arrayposs))) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| err = locEncoder.Add(docNum, loc.arrayposs...) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| } | |||
| locOffset += freqNorm.numLocs | |||
| } | |||
| freqNormOffset++ | |||
| docTermMap[docNum] = append( | |||
| append(docTermMap[docNum], term...), | |||
| termSeparator) | |||
| } | |||
| tfEncoder.Close() | |||
| locEncoder.Close() | |||
| postingsOffset, err := | |||
| writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| if postingsOffset > uint64(0) { | |||
| err = s.builder.Insert([]byte(term), postingsOffset) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| } | |||
| tfEncoder.Reset() | |||
| locEncoder.Reset() | |||
| } | |||
| err = s.builder.Close() | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| // record where this dictionary starts | |||
| dictOffsets[fieldID] = uint64(s.w.Count()) | |||
| vellumData := s.builderBuf.Bytes() | |||
| // write out the length of the vellum data | |||
| n := binary.PutUvarint(buf, uint64(len(vellumData))) | |||
| _, err = s.w.Write(buf[:n]) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| // write this vellum to disk | |||
| _, err = s.w.Write(vellumData) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| // reset vellum for reuse | |||
| s.builderBuf.Reset() | |||
| err = s.builder.Reset(&s.builderBuf) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| // write the field doc values | |||
| if s.IncludeDocValues[fieldID] { | |||
| for docNum, docTerms := range docTermMap { | |||
| if len(docTerms) > 0 { | |||
| err = fdvEncoder.Add(uint64(docNum), docTerms) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| } | |||
| } | |||
| err = fdvEncoder.Close() | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| fdvOffsetsStart[fieldID] = uint64(s.w.Count()) | |||
| _, err = fdvEncoder.Write() | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| fdvOffsetsEnd[fieldID] = uint64(s.w.Count()) | |||
| fdvEncoder.Reset() | |||
| } else { | |||
| fdvOffsetsStart[fieldID] = fieldNotUninverted | |||
| fdvOffsetsEnd[fieldID] = fieldNotUninverted | |||
| } | |||
| } | |||
| fdvIndexOffset = uint64(s.w.Count()) | |||
| for i := 0; i < len(fdvOffsetsStart); i++ { | |||
| n := binary.PutUvarint(buf, fdvOffsetsStart[i]) | |||
| _, err := s.w.Write(buf[:n]) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| n = binary.PutUvarint(buf, fdvOffsetsEnd[i]) | |||
| _, err = s.w.Write(buf[:n]) | |||
| if err != nil { | |||
| return 0, nil, err | |||
| } | |||
| } | |||
| return fdvIndexOffset, dictOffsets, nil | |||
| } | |||
| func encodeFieldType(f document.Field) byte { | |||
| fieldType := byte('x') | |||
| switch f.(type) { | |||
| case *document.TextField: | |||
| fieldType = 't' | |||
| case *document.NumericField: | |||
| fieldType = 'n' | |||
| case *document.DateTimeField: | |||
| fieldType = 'd' | |||
| case *document.BooleanField: | |||
| fieldType = 'b' | |||
| case *document.GeoPointField: | |||
| fieldType = 'g' | |||
| case *document.CompositeField: | |||
| fieldType = 'c' | |||
| } | |||
| return fieldType | |||
| } | |||
| // returns the total # of bytes needed to encode the given uint64s | |||
| // into binary.PutUvarint() encoding | |||
| func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) { | |||
| n = numUvarintBytes(a) | |||
| n += numUvarintBytes(b) | |||
| n += numUvarintBytes(c) | |||
| n += numUvarintBytes(d) | |||
| n += numUvarintBytes(e) | |||
| for _, v := range more { | |||
| n += numUvarintBytes(v) | |||
| } | |||
| return n | |||
| } | |||
| // returns # of bytes needed to encode x in binary.PutUvarint() encoding | |||
| func numUvarintBytes(x uint64) (n int) { | |||
| for x >= 0x80 { | |||
| x >>= 7 | |||
| n++ | |||
| } | |||
| return n + 1 | |||
| } | |||
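| // e.g. numUvarintBytes(127) == 1, numUvarintBytes(128) == 2, and | |||
| // numUvarintBytes(1<<63) == 10, since each varint byte carries 7 payload | |||
| // bits, matching binary.PutUvarint. | |||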
| @@ -20,16 +20,24 @@ import ( | |||
| "fmt" | |||
| "io" | |||
| "os" | |||
| "reflect" | |||
| "sync" | |||
| "github.com/RoaringBitmap/roaring" | |||
| "github.com/Smerity/govarint" | |||
| "github.com/blevesearch/bleve/index/scorch/segment" | |||
| "github.com/blevesearch/bleve/size" | |||
| "github.com/couchbase/vellum" | |||
| mmap "github.com/edsrzf/mmap-go" | |||
| "github.com/golang/snappy" | |||
| ) | |||
| var reflectStaticSizeSegmentBase int | |||
| func init() { | |||
| var sb SegmentBase | |||
| reflectStaticSizeSegmentBase = int(reflect.TypeOf(sb).Size()) | |||
| } | |||
| // Open returns a zap impl of a segment | |||
| func Open(path string) (segment.Segment, error) { | |||
| f, err := os.Open(path) | |||
| @@ -47,13 +55,14 @@ func Open(path string) (segment.Segment, error) { | |||
| SegmentBase: SegmentBase{ | |||
| mem: mm[0 : len(mm)-FooterSize], | |||
| fieldsMap: make(map[string]uint16), | |||
| fieldDvIterMap: make(map[uint16]*docValueIterator), | |||
| fieldDvReaders: make(map[uint16]*docValueReader), | |||
| }, | |||
| f: f, | |||
| mm: mm, | |||
| path: path, | |||
| refs: 1, | |||
| } | |||
| rv.SegmentBase.updateSize() | |||
| err = rv.loadConfig() | |||
| if err != nil { | |||
| @@ -67,7 +76,7 @@ func Open(path string) (segment.Segment, error) { | |||
| return nil, err | |||
| } | |||
| err = rv.loadDvIterators() | |||
| err = rv.loadDvReaders() | |||
| if err != nil { | |||
| _ = rv.Close() | |||
| return nil, err | |||
| @@ -89,7 +98,39 @@ type SegmentBase struct { | |||
| fieldsIndexOffset uint64 | |||
| docValueOffset uint64 | |||
| dictLocs []uint64 | |||
| fieldDvIterMap map[uint16]*docValueIterator // naive chunk cache per field | |||
| fieldDvReaders map[uint16]*docValueReader // naive chunk cache per field | |||
| fieldDvNames []string // field names cached in fieldDvReaders | |||
| size uint64 | |||
| } | |||
| func (sb *SegmentBase) Size() int { | |||
| return int(sb.size) | |||
| } | |||
| func (sb *SegmentBase) updateSize() { | |||
| sizeInBytes := reflectStaticSizeSegmentBase + | |||
| cap(sb.mem) | |||
| // fieldsMap | |||
| for k := range sb.fieldsMap { | |||
| sizeInBytes += (len(k) + size.SizeOfString) + size.SizeOfUint16 | |||
| } | |||
| // fieldsInv, dictLocs | |||
| for _, entry := range sb.fieldsInv { | |||
| sizeInBytes += len(entry) + size.SizeOfString | |||
| } | |||
| sizeInBytes += len(sb.dictLocs) * size.SizeOfUint64 | |||
| // fieldDvReaders | |||
| for _, v := range sb.fieldDvReaders { | |||
| sizeInBytes += size.SizeOfUint16 + size.SizeOfPtr | |||
| if v != nil { | |||
| sizeInBytes += v.size() | |||
| } | |||
| } | |||
| sb.size = uint64(sizeInBytes) | |||
| } | |||
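| // Note that cap(sb.mem) is counted here even when it is an mmap'ed | |||
| // region; Segment.Size below subtracts it again so file-backed segments | |||
| // report only their heap-resident overhead. | |||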
| func (sb *SegmentBase) AddRef() {} | |||
| @@ -111,56 +152,19 @@ type Segment struct { | |||
| refs int64 | |||
| } | |||
| func (s *Segment) SizeInBytes() uint64 { | |||
| func (s *Segment) Size() int { | |||
| // 8 /* size of file pointer */ | |||
| // 4 /* size of version -> uint32 */ | |||
| // 4 /* size of crc -> uint32 */ | |||
| sizeOfUints := 16 | |||
| sizeInBytes := (len(s.path) + int(segment.SizeOfString)) + sizeOfUints | |||
| sizeInBytes := (len(s.path) + size.SizeOfString) + sizeOfUints | |||
| // mutex, refs -> int64 | |||
| sizeInBytes += 16 | |||
| // do not include the mmap'ed part | |||
| return uint64(sizeInBytes) + s.SegmentBase.SizeInBytes() - uint64(len(s.mem)) | |||
| } | |||
| func (s *SegmentBase) SizeInBytes() uint64 { | |||
| // 4 /* size of memCRC -> uint32 */ | |||
| // 4 /* size of chunkFactor -> uint32 */ | |||
| // 8 /* size of numDocs -> uint64 */ | |||
| // 8 /* size of storedIndexOffset -> uint64 */ | |||
| // 8 /* size of fieldsIndexOffset -> uint64 */ | |||
| // 8 /* size of docValueOffset -> uint64 */ | |||
| sizeInBytes := 40 | |||
| sizeInBytes += len(s.mem) + int(segment.SizeOfSlice) | |||
| // fieldsMap | |||
| for k, _ := range s.fieldsMap { | |||
| sizeInBytes += (len(k) + int(segment.SizeOfString)) + 2 /* size of uint16 */ | |||
| } | |||
| sizeInBytes += int(segment.SizeOfMap) /* overhead from map */ | |||
| // fieldsInv, dictLocs | |||
| for _, entry := range s.fieldsInv { | |||
| sizeInBytes += (len(entry) + int(segment.SizeOfString)) | |||
| } | |||
| sizeInBytes += len(s.dictLocs) * 8 /* size of uint64 */ | |||
| sizeInBytes += int(segment.SizeOfSlice) * 3 /* overhead from slices */ | |||
| // fieldDvIterMap | |||
| sizeInBytes += len(s.fieldDvIterMap) * | |||
| int(segment.SizeOfPointer+2 /* size of uint16 */) | |||
| for _, entry := range s.fieldDvIterMap { | |||
| if entry != nil { | |||
| sizeInBytes += int(entry.sizeInBytes()) | |||
| } | |||
| } | |||
| sizeInBytes += int(segment.SizeOfMap) | |||
| return uint64(sizeInBytes) | |||
| return sizeInBytes + s.SegmentBase.Size() - cap(s.mem) | |||
| } | |||
| func (s *Segment) AddRef() { | |||
| @@ -185,7 +189,7 @@ func (s *Segment) loadConfig() error { | |||
| verOffset := crcOffset - 4 | |||
| s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4]) | |||
| if s.version != version { | |||
| if s.version != Version { | |||
| return fmt.Errorf("unsupported version %d", s.version) | |||
| } | |||
| @@ -207,7 +211,7 @@ func (s *Segment) loadConfig() error { | |||
| } | |||
| func (s *SegmentBase) loadFields() error { | |||
| // NOTE for now we assume the fields index immediately preceeds | |||
| // NOTE for now we assume the fields index immediately precedes | |||
| // the footer, and if this changes, need to adjust accordingly (or | |||
| // store explicit length), where s.mem was sliced from s.mm in Open(). | |||
| fieldsIndexEnd := uint64(len(s.mem)) | |||
| @@ -262,6 +266,10 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { | |||
| if err != nil { | |||
| return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err) | |||
| } | |||
| rv.fstReader, err = rv.fst.Reader() | |||
| if err != nil { | |||
| return nil, fmt.Errorf("dictionary field %s vellum reader err: %v", field, err) | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -269,50 +277,90 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { | |||
| return rv, nil | |||
| } | |||
| // visitDocumentCtx holds data structures that are reusable across | |||
| // multiple VisitDocument() calls to avoid memory allocations | |||
| type visitDocumentCtx struct { | |||
| buf []byte | |||
| reader bytes.Reader | |||
| arrayPos []uint64 | |||
| } | |||
| var visitDocumentCtxPool = sync.Pool{ | |||
| New: func() interface{} { | |||
| reuse := &visitDocumentCtx{} | |||
| return reuse | |||
| }, | |||
| } | |||
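| // Pooling these contexts lets the bytes.Reader, snappy scratch buffer, | |||
| // and arrayPos slice be reused across calls; values handed to the | |||
| // visitor may alias vdc.buf, which is presumably why the Document() code | |||
| // later in this diff copies them before retaining. | |||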
| // VisitDocument invokes the DocumentFieldValueVisitor for each stored field | |||
| // for the specified doc number | |||
| func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { | |||
| vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) | |||
| defer visitDocumentCtxPool.Put(vdc) | |||
| return s.visitDocument(vdc, num, visitor) | |||
| } | |||
| func (s *SegmentBase) visitDocument(vdc *visitDocumentCtx, num uint64, | |||
| visitor segment.DocumentFieldValueVisitor) error { | |||
| // first make sure this is a valid number in this segment | |||
| if num < s.numDocs { | |||
| meta, compressed := s.getDocStoredMetaAndCompressed(num) | |||
| uncompressed, err := snappy.Decode(nil, compressed) | |||
| vdc.reader.Reset(meta) | |||
| // handle _id field special case | |||
| idFieldValLen, err := binary.ReadUvarint(&vdc.reader) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| idFieldVal := compressed[:idFieldValLen] | |||
| keepGoing := visitor("_id", byte('t'), idFieldVal, nil) | |||
| if !keepGoing { | |||
| visitDocumentCtxPool.Put(vdc) | |||
| return nil | |||
| } | |||
| // handle non-"_id" fields | |||
| compressed = compressed[idFieldValLen:] | |||
| uncompressed, err := snappy.Decode(vdc.buf[:cap(vdc.buf)], compressed) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| // now decode meta and process | |||
| reader := bytes.NewReader(meta) | |||
| decoder := govarint.NewU64Base128Decoder(reader) | |||
| keepGoing := true | |||
| for keepGoing { | |||
| field, err := decoder.GetU64() | |||
| field, err := binary.ReadUvarint(&vdc.reader) | |||
| if err == io.EOF { | |||
| break | |||
| } | |||
| if err != nil { | |||
| return err | |||
| } | |||
| typ, err := decoder.GetU64() | |||
| typ, err := binary.ReadUvarint(&vdc.reader) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| offset, err := decoder.GetU64() | |||
| offset, err := binary.ReadUvarint(&vdc.reader) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| l, err := decoder.GetU64() | |||
| l, err := binary.ReadUvarint(&vdc.reader) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| numap, err := decoder.GetU64() | |||
| numap, err := binary.ReadUvarint(&vdc.reader) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| var arrayPos []uint64 | |||
| if numap > 0 { | |||
| arrayPos = make([]uint64, numap) | |||
| if cap(vdc.arrayPos) < int(numap) { | |||
| vdc.arrayPos = make([]uint64, numap) | |||
| } | |||
| arrayPos = vdc.arrayPos[:numap] | |||
| for i := 0; i < int(numap); i++ { | |||
| ap, err := decoder.GetU64() | |||
| ap, err := binary.ReadUvarint(&vdc.reader) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| @@ -323,10 +371,36 @@ func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldVal | |||
| value := uncompressed[offset : offset+l] | |||
| keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos) | |||
| } | |||
| vdc.buf = uncompressed | |||
| } | |||
| return nil | |||
| } | |||
| // DocID returns the value of the _id field for the given docNum | |||
| func (s *SegmentBase) DocID(num uint64) ([]byte, error) { | |||
| if num >= s.numDocs { | |||
| return nil, nil | |||
| } | |||
| vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) | |||
| meta, compressed := s.getDocStoredMetaAndCompressed(num) | |||
| vdc.reader.Reset(meta) | |||
| // handle _id field special case | |||
| idFieldValLen, err := binary.ReadUvarint(&vdc.reader) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| idFieldVal := compressed[:idFieldValLen] | |||
| visitDocumentCtxPool.Put(vdc) | |||
| return idFieldVal, nil | |||
| } | |||
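| // Because the _id value sits uncompressed right after the meta block | |||
| // (see writeStoredFields), DocID needs just one uvarint read and a slice | |||
| // of the stored bytes -- no snappy decode. | |||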
| // Count returns the number of documents in this segment. | |||
| func (s *SegmentBase) Count() uint64 { | |||
| return s.numDocs | |||
| @@ -343,15 +417,26 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { | |||
| return nil, err | |||
| } | |||
| var postings *PostingsList | |||
| postingsList := emptyPostingsList | |||
| sMax, err := idDict.fst.GetMaxKey() | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| sMaxStr := string(sMax) | |||
| filteredIds := make([]string, 0, len(ids)) | |||
| for _, id := range ids { | |||
| postings, err = idDict.postingsList([]byte(id), nil, postings) | |||
| if id <= sMaxStr { | |||
| filteredIds = append(filteredIds, id) | |||
| } | |||
| } | |||
| for _, id := range filteredIds { | |||
| postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| if postings.postings != nil { | |||
| rv.Or(postings.postings) | |||
| } | |||
| postingsList.OrInto(rv) | |||
| } | |||
| } | |||
| @@ -441,19 +526,32 @@ func (s *Segment) DictAddr(field string) (uint64, error) { | |||
| return s.dictLocs[fieldIDPlus1-1], nil | |||
| } | |||
| func (s *SegmentBase) loadDvIterators() error { | |||
| func (s *SegmentBase) loadDvReaders() error { | |||
| if s.docValueOffset == fieldNotUninverted { | |||
| return nil | |||
| } | |||
| var read uint64 | |||
| for fieldID, field := range s.fieldsInv { | |||
| fieldLoc, n := binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) | |||
| var fieldLocStart, fieldLocEnd uint64 | |||
| var n int | |||
| fieldLocStart, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) | |||
| if n <= 0 { | |||
| return fmt.Errorf("loadDvIterators: failed to read the docvalue offsets for field %d", fieldID) | |||
| return fmt.Errorf("loadDvReaders: failed to read the docvalue offset start for field %d", fieldID) | |||
| } | |||
| s.fieldDvIterMap[uint16(fieldID)], _ = s.loadFieldDocValueIterator(field, fieldLoc) | |||
| read += uint64(n) | |||
| fieldLocEnd, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) | |||
| if n <= 0 { | |||
| return fmt.Errorf("loadDvReaders: failed to read the docvalue offset end for field %d", fieldID) | |||
| } | |||
| read += uint64(n) | |||
| fieldDvReader, _ := s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd) | |||
| if fieldDvReader != nil { | |||
| s.fieldDvReaders[uint16(fieldID)] = fieldDvReader | |||
| s.fieldDvNames = append(s.fieldDvNames, field) | |||
| } | |||
| } | |||
| return nil | |||
| } | |||
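| // The docvalue index decoded above is a flat run of | |||
| // (uvarint start, uvarint end) pairs in fieldsInv order; fields that | |||
| // were never uninverted carry the fieldNotUninverted sentinel in both | |||
| // slots (see writeDicts), and presumably yield a nil reader that is | |||
| // skipped here. | |||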
| @@ -15,7 +15,6 @@ | |||
| package zap | |||
| import ( | |||
| "bytes" | |||
| "encoding/binary" | |||
| "io" | |||
| @@ -25,28 +24,29 @@ import ( | |||
| // writes out the length of the roaring bitmap in bytes as varint | |||
| // then writes out the roaring bitmap itself | |||
| func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer, | |||
| reuseBuf *bytes.Buffer, reuseBufVarint []byte) (int, error) { | |||
| reuseBuf.Reset() | |||
| // write out postings list to memory so we know the len | |||
| postingsListLen, err := r.WriteTo(reuseBuf) | |||
| reuseBufVarint []byte) (int, error) { | |||
| buf, err := r.ToBytes() | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| var tw int | |||
| // write out the length of this postings list | |||
| n := binary.PutUvarint(reuseBufVarint, uint64(postingsListLen)) | |||
| // write out the length | |||
| n := binary.PutUvarint(reuseBufVarint, uint64(len(buf))) | |||
| nw, err := w.Write(reuseBufVarint[:n]) | |||
| tw += nw | |||
| if err != nil { | |||
| return tw, err | |||
| } | |||
| // write out the postings list itself | |||
| nw, err = w.Write(reuseBuf.Bytes()) | |||
| // write out the roaring bytes | |||
| nw, err = w.Write(buf) | |||
| tw += nw | |||
| if err != nil { | |||
| return tw, err | |||
| } | |||
| return tw, nil | |||
| } | |||
| @@ -118,7 +118,7 @@ func persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset | |||
| return err | |||
| } | |||
| // write out 32-bit version | |||
| err = binary.Write(w, binary.BigEndian, version) | |||
| err = binary.Write(w, binary.BigEndian, Version) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| @@ -15,10 +15,10 @@ | |||
| package scorch | |||
| import ( | |||
| "bytes" | |||
| "container/heap" | |||
| "encoding/binary" | |||
| "fmt" | |||
| "reflect" | |||
| "sort" | |||
| "sync" | |||
| "sync/atomic" | |||
| @@ -27,8 +27,13 @@ import ( | |||
| "github.com/blevesearch/bleve/document" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/index/scorch/segment" | |||
| "github.com/couchbase/vellum" | |||
| lev2 "github.com/couchbase/vellum/levenshtein2" | |||
| ) | |||
| // re usable, threadsafe levenshtein builders | |||
| var lb1, lb2 *lev2.LevenshteinAutomatonBuilder | |||
| type asynchSegmentResult struct { | |||
| dictItr segment.DictionaryIterator | |||
| @@ -40,15 +45,36 @@ type asynchSegmentResult struct { | |||
| err error | |||
| } | |||
| var reflectStaticSizeIndexSnapshot int | |||
| func init() { | |||
| var is interface{} = IndexSnapshot{} | |||
| reflectStaticSizeIndexSnapshot = int(reflect.TypeOf(is).Size()) | |||
| var err error | |||
| lb1, err = lev2.NewLevenshteinAutomatonBuilder(1, true) | |||
| if err != nil { | |||
| panic(fmt.Errorf("Levenshtein automaton ed1 builder err: %v", err)) | |||
| } | |||
| lb2, err = lev2.NewLevenshteinAutomatonBuilder(2, true) | |||
| if err != nil { | |||
| panic(fmt.Errorf("Levenshtein automaton ed2 builder err: %v", err)) | |||
| } | |||
| } | |||
| type IndexSnapshot struct { | |||
| parent *Scorch | |||
| segment []*SegmentSnapshot | |||
| offsets []uint64 | |||
| internal map[string][]byte | |||
| epoch uint64 | |||
| size uint64 | |||
| creator string | |||
| m sync.Mutex // Protects the fields that follow. | |||
| refs int64 | |||
| m2 sync.Mutex // Protects the fields that follow. | |||
| fieldTFRs map[string][]*IndexSnapshotTermFieldReader // keyed by field, recycled TFR's | |||
| } | |||
| func (i *IndexSnapshot) Segments() []*SegmentSnapshot { | |||
| @@ -85,12 +111,27 @@ func (i *IndexSnapshot) DecRef() (err error) { | |||
| return err | |||
| } | |||
| func (i *IndexSnapshot) Close() error { | |||
| return i.DecRef() | |||
| } | |||
| func (i *IndexSnapshot) Size() int { | |||
| return int(i.size) | |||
| } | |||
| func (i *IndexSnapshot) updateSize() { | |||
| i.size += uint64(reflectStaticSizeIndexSnapshot) | |||
| for _, s := range i.segment { | |||
| i.size += uint64(s.Size()) | |||
| } | |||
| } | |||
| func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) { | |||
| results := make(chan *asynchSegmentResult) | |||
| for index, segment := range i.segment { | |||
| go func(index int, segment *SegmentSnapshot) { | |||
| dict, err := segment.Dictionary(field) | |||
| dict, err := segment.segment.Dictionary(field) | |||
| if err != nil { | |||
| results <- &asynchSegmentResult{err: err} | |||
| } else { | |||
| @@ -116,7 +157,7 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i s | |||
| if next != nil { | |||
| rv.cursors = append(rv.cursors, &segmentDictCursor{ | |||
| itr: asr.dictItr, | |||
| curr: next, | |||
| curr: *next, | |||
| }) | |||
| } | |||
| } | |||
| @@ -151,6 +192,56 @@ func (i *IndexSnapshot) FieldDictPrefix(field string, | |||
| }) | |||
| } | |||
| func (i *IndexSnapshot) FieldDictRegexp(field string, | |||
| termRegex string) (index.FieldDict, error) { | |||
| // TODO: potential optimization where the literal prefix represents the | |||
| // entire regexp, allowing us to use PrefixIterator(prefixTerm)? | |||
| a, prefixBeg, prefixEnd, err := segment.ParseRegexp(termRegex) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { | |||
| return i.AutomatonIterator(a, prefixBeg, prefixEnd) | |||
| }) | |||
| } | |||
| func (i *IndexSnapshot) getLevAutomaton(term string, | |||
| fuzziness uint8) (vellum.Automaton, error) { | |||
| if fuzziness == 1 { | |||
| return lb1.BuildDfa(term, fuzziness) | |||
| } else if fuzziness == 2 { | |||
| return lb2.BuildDfa(term, fuzziness) | |||
| } | |||
| return nil, fmt.Errorf("fuzziness exceeds the max limit") | |||
| } | |||
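The builders lb1 and lb2 are constructed once in the package init above because building a LevenshteinAutomatonBuilder is the expensive step; BuildDfa is the cheap per-term call and, per the comment, the builders are safe for concurrent reuse. A minimal standalone sketch of the same pattern, assuming the vellum/levenshtein2 API exactly as this patch uses it:

```go
package main

import (
	"fmt"

	"github.com/couchbase/vellum"
	lev2 "github.com/couchbase/vellum/levenshtein2"
)

// Build the expensive builder once; reuse it for every query term.
var lb1 *lev2.LevenshteinAutomatonBuilder

func init() {
	var err error
	lb1, err = lev2.NewLevenshteinAutomatonBuilder(1, true) // edit distance 1, transpositions allowed
	if err != nil {
		panic(err)
	}
}

func main() {
	// BuildDfa is the cheap per-term step; the DFA can then drive a
	// term dictionary's AutomatonIterator.
	dfa, err := lb1.BuildDfa("water", 1)
	if err != nil {
		panic(err)
	}
	var a vellum.Automaton = dfa // the DFA satisfies vellum.Automaton
	fmt.Println(a.CanMatch(a.Start()))
}
```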
| func (i *IndexSnapshot) FieldDictFuzzy(field string, | |||
| term string, fuzziness int, prefix string) (index.FieldDict, error) { | |||
| a, err := i.getLevAutomaton(term, uint8(fuzziness)) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| var prefixBeg, prefixEnd []byte | |||
| if prefix != "" { | |||
| prefixBeg = []byte(prefix) | |||
| prefixEnd = segment.IncrementBytes(prefixBeg) | |||
| } | |||
| return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { | |||
| return i.AutomatonIterator(a, prefixBeg, prefixEnd) | |||
| }) | |||
| } | |||
| func (i *IndexSnapshot) FieldDictOnly(field string, | |||
| onlyTerms [][]byte, includeCount bool) (index.FieldDict, error) { | |||
| return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { | |||
| return i.OnlyIterator(onlyTerms, includeCount) | |||
| }) | |||
| } | |||
| func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) { | |||
| results := make(chan *asynchSegmentResult) | |||
| for index, segment := range i.segment { | |||
| @@ -264,21 +355,26 @@ func (i *IndexSnapshot) Document(id string) (rv *document.Document, err error) { | |||
| segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) | |||
| rv = document.NewDocument(id) | |||
| err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, value []byte, pos []uint64) bool { | |||
| err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, val []byte, pos []uint64) bool { | |||
| if name == "_id" { | |||
| return true | |||
| } | |||
| // copy value, array positions to preserve them beyond the scope of this callback | |||
| value := append([]byte(nil), val...) | |||
| arrayPos := append([]uint64(nil), pos...) | |||
| switch typ { | |||
| case 't': | |||
| rv.AddField(document.NewTextField(name, pos, value)) | |||
| rv.AddField(document.NewTextField(name, arrayPos, value)) | |||
| case 'n': | |||
| rv.AddField(document.NewNumericFieldFromBytes(name, pos, value)) | |||
| rv.AddField(document.NewNumericFieldFromBytes(name, arrayPos, value)) | |||
| case 'd': | |||
| rv.AddField(document.NewDateTimeFieldFromBytes(name, pos, value)) | |||
| rv.AddField(document.NewDateTimeFieldFromBytes(name, arrayPos, value)) | |||
| case 'b': | |||
| rv.AddField(document.NewBooleanFieldFromBytes(name, pos, value)) | |||
| rv.AddField(document.NewBooleanFieldFromBytes(name, arrayPos, value)) | |||
| case 'g': | |||
| rv.AddField(document.NewGeoPointFieldFromBytes(name, pos, value)) | |||
| rv.AddField(document.NewGeoPointFieldFromBytes(name, arrayPos, value)) | |||
| } | |||
| return true | |||
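The new append([]byte(nil), ...) copies in this hunk matter because a segment may reuse the val and pos buffers once the VisitDocument callback returns, so retaining them directly would let later visits clobber earlier fields. A self-contained sketch of the aliasing hazard (the visit helper here is illustrative, not bleve's):

```go
package main

import "fmt"

// visit invokes the callback with a buffer it reuses between calls,
// mimicking a segment handing out scratch memory to a VisitDocument visitor.
func visit(cb func(val []byte)) {
	buf := make([]byte, 0, 8)
	for _, s := range []string{"aaaa", "bbbb"} {
		buf = append(buf[:0], s...) // same backing array on every iteration
		cb(buf)
	}
}

func main() {
	var aliased, copied [][]byte
	visit(func(val []byte) {
		aliased = append(aliased, val)                       // keeps a live alias into the scratch buffer
		copied = append(copied, append([]byte(nil), val...)) // private copy, as the patch now does
	})
	fmt.Println(string(aliased[0]), string(aliased[1])) // bbbb bbbb: the first value was clobbered
	fmt.Println(string(copied[0]), string(copied[1]))   // aaaa bbbb
}
```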
| @@ -307,24 +403,15 @@ func (i *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) { | |||
| } | |||
| segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) | |||
| var found bool | |||
| var rv string | |||
| err = i.segment[segmentIndex].VisitDocument(localDocNum, func(field string, typ byte, value []byte, pos []uint64) bool { | |||
| if field == "_id" { | |||
| found = true | |||
| rv = string(value) | |||
| return false | |||
| } | |||
| return true | |||
| }) | |||
| v, err := i.segment[segmentIndex].DocID(localDocNum) | |||
| if err != nil { | |||
| return "", err | |||
| } | |||
| if found { | |||
| return rv, nil | |||
| if v == nil { | |||
| return "", fmt.Errorf("document number %d not found", docNum) | |||
| } | |||
| return "", fmt.Errorf("document number %d not found", docNum) | |||
| return string(v), nil | |||
| } | |||
| func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err error) { | |||
| @@ -349,33 +436,81 @@ func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err err | |||
| func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, | |||
| includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { | |||
| rv := &IndexSnapshotTermFieldReader{ | |||
| term: term, | |||
| field: field, | |||
| snapshot: i, | |||
| postings: make([]segment.PostingsList, len(i.segment)), | |||
| iterators: make([]segment.PostingsIterator, len(i.segment)), | |||
| includeFreq: includeFreq, | |||
| includeNorm: includeNorm, | |||
| includeTermVectors: includeTermVectors, | |||
| rv := i.allocTermFieldReaderDicts(field) | |||
| rv.term = term | |||
| rv.field = field | |||
| rv.snapshot = i | |||
| if rv.postings == nil { | |||
| rv.postings = make([]segment.PostingsList, len(i.segment)) | |||
| } | |||
| if rv.iterators == nil { | |||
| rv.iterators = make([]segment.PostingsIterator, len(i.segment)) | |||
| } | |||
| rv.segmentOffset = 0 | |||
| rv.includeFreq = includeFreq | |||
| rv.includeNorm = includeNorm | |||
| rv.includeTermVectors = includeTermVectors | |||
| rv.currPosting = nil | |||
| rv.currID = rv.currID[:0] | |||
| if rv.dicts == nil { | |||
| rv.dicts = make([]segment.TermDictionary, len(i.segment)) | |||
| for i, segment := range i.segment { | |||
| dict, err := segment.segment.Dictionary(field) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| rv.dicts[i] = dict | |||
| } | |||
| } | |||
| for i, segment := range i.segment { | |||
| dict, err := segment.Dictionary(field) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| pl, err := dict.PostingsList(string(term), nil) | |||
| pl, err := rv.dicts[i].PostingsList(term, segment.deleted, rv.postings[i]) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| rv.postings[i] = pl | |||
| rv.iterators[i] = pl.Iterator() | |||
| rv.iterators[i] = pl.Iterator(includeFreq, includeNorm, includeTermVectors, rv.iterators[i]) | |||
| } | |||
| atomic.AddUint64(&i.parent.stats.termSearchersStarted, uint64(1)) | |||
| atomic.AddUint64(&i.parent.stats.TotTermSearchersStarted, uint64(1)) | |||
| return rv, nil | |||
| } | |||
| func (i *IndexSnapshot) allocTermFieldReaderDicts(field string) (tfr *IndexSnapshotTermFieldReader) { | |||
| i.m2.Lock() | |||
| if i.fieldTFRs != nil { | |||
| tfrs := i.fieldTFRs[field] | |||
| last := len(tfrs) - 1 | |||
| if last >= 0 { | |||
| tfr = tfrs[last] | |||
| tfrs[last] = nil | |||
| i.fieldTFRs[field] = tfrs[:last] | |||
| i.m2.Unlock() | |||
| return | |||
| } | |||
| } | |||
| i.m2.Unlock() | |||
| return &IndexSnapshotTermFieldReader{} | |||
| } | |||
| func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) { | |||
| i.parent.rootLock.RLock() | |||
| obsolete := i.parent.root != i | |||
| i.parent.rootLock.RUnlock() | |||
| if obsolete { | |||
| // if we're not the current root (mutations happened), don't bother recycling | |||
| return | |||
| } | |||
| i.m2.Lock() | |||
| if i.fieldTFRs == nil { | |||
| i.fieldTFRs = map[string][]*IndexSnapshotTermFieldReader{} | |||
| } | |||
| i.fieldTFRs[tfr.field] = append(i.fieldTFRs[tfr.field], tfr) | |||
| i.m2.Unlock() | |||
| } | |||
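allocTermFieldReaderDicts and recycleTermFieldReader together form a per-field free list: Close() parks a reader on its field's list (only while the snapshot is still the root), and the next TermFieldReader for that field reuses its dicts, postings, and iterators instead of reallocating them. A standalone sketch of the same keyed free-list pattern, using a hypothetical reader type rather than bleve's:

```go
package main

import (
	"fmt"
	"sync"
)

type reader struct{ field string }

type pool struct {
	mu   sync.Mutex
	free map[string][]*reader // keyed by field, LIFO free list
}

func (p *pool) get(field string) *reader {
	p.mu.Lock()
	if rs := p.free[field]; len(rs) > 0 {
		r := rs[len(rs)-1]
		rs[len(rs)-1] = nil // drop the slot's reference so it can be collected if never reused
		p.free[field] = rs[:len(rs)-1]
		p.mu.Unlock()
		return r
	}
	p.mu.Unlock()
	return &reader{field: field}
}

func (p *pool) put(r *reader) {
	p.mu.Lock()
	if p.free == nil {
		p.free = map[string][]*reader{}
	}
	p.free[r.field] = append(p.free[r.field], r)
	p.mu.Unlock()
}

func main() {
	p := &pool{}
	r := p.get("title")
	p.put(r)
	fmt.Println(p.get("title") == r) // true: recycled, not reallocated
}
```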
| func docNumberToBytes(buf []byte, in uint64) []byte { | |||
| if len(buf) != 8 { | |||
| if cap(buf) >= 8 { | |||
| @@ -389,115 +524,172 @@ func docNumberToBytes(buf []byte, in uint64) []byte { | |||
| } | |||
| func docInternalToNumber(in index.IndexInternalID) (uint64, error) { | |||
| var res uint64 | |||
| err := binary.Read(bytes.NewReader(in), binary.BigEndian, &res) | |||
| if err != nil { | |||
| return 0, err | |||
| if len(in) != 8 { | |||
| return 0, fmt.Errorf("wrong len for IndexInternalID: %q", in) | |||
| } | |||
| return res, nil | |||
| return binary.BigEndian.Uint64(in), nil | |||
| } | |||
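The rewritten docInternalToNumber swaps a binary.Read through a bytes.Reader (which allocates and goes through the io path) for an explicit length check plus a direct fixed-width decode. A quick stdlib-only round trip showing the encoding that docNumberToBytes produces and this function consumes:

```go
package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	buf := make([]byte, 8)
	binary.BigEndian.PutUint64(buf, 1234567) // what docNumberToBytes does for a fresh 8-byte buf

	if len(buf) != 8 {
		panic("wrong len for internal ID")
	}
	fmt.Println(binary.BigEndian.Uint64(buf)) // 1234567, with no reader and no reflection
}
```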
| func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, | |||
| fields []string, visitor index.DocumentFieldTermVisitor) error { | |||
| _, err := i.documentVisitFieldTerms(id, fields, visitor, nil) | |||
| return err | |||
| } | |||
| func (i *IndexSnapshot) documentVisitFieldTerms(id index.IndexInternalID, | |||
| fields []string, visitor index.DocumentFieldTermVisitor, | |||
| dvs segment.DocVisitState) (segment.DocVisitState, error) { | |||
| docNum, err := docInternalToNumber(id) | |||
| if err != nil { | |||
| return err | |||
| return nil, err | |||
| } | |||
| segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) | |||
| if segmentIndex >= len(i.segment) { | |||
| return nil | |||
| return nil, nil | |||
| } | |||
| _, dvs, err = i.documentVisitFieldTermsOnSegment( | |||
| segmentIndex, localDocNum, fields, nil, visitor, dvs) | |||
| return dvs, err | |||
| } | |||
| func (i *IndexSnapshot) documentVisitFieldTermsOnSegment( | |||
| segmentIndex int, localDocNum uint64, fields []string, cFields []string, | |||
| visitor index.DocumentFieldTermVisitor, dvs segment.DocVisitState) ( | |||
| cFieldsOut []string, dvsOut segment.DocVisitState, err error) { | |||
| ss := i.segment[segmentIndex] | |||
| if zaps, ok := ss.segment.(segment.DocumentFieldTermVisitable); ok { | |||
| // get the list of doc value persisted fields | |||
| pFields, err := zaps.VisitableDocValueFields() | |||
| var vFields []string // fields that are visitable via the segment | |||
| ssv, ssvOk := ss.segment.(segment.DocumentFieldTermVisitable) | |||
| if ssvOk && ssv != nil { | |||
| vFields, err = ssv.VisitableDocValueFields() | |||
| if err != nil { | |||
| return err | |||
| } | |||
| // assort the fields for which terms look up have to | |||
| // be performed runtime | |||
| dvPendingFields := extractDvPendingFields(fields, pFields) | |||
| if len(dvPendingFields) == 0 { | |||
| // all fields are doc value persisted | |||
| return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor) | |||
| return nil, nil, err | |||
| } | |||
| } | |||
| // concurrently trigger the runtime doc value preparations for | |||
| // pending fields as well as the visit of the persisted doc values | |||
| errCh := make(chan error, 1) | |||
| var errCh chan error | |||
| go func() { | |||
| defer close(errCh) | |||
| err := ss.cachedDocs.prepareFields(fields, ss) | |||
| if err != nil { | |||
| errCh <- err | |||
| } | |||
| }() | |||
| // cFields represents the fields that we'll need from the | |||
| // cachedDocs, and might optionally be provided by the caller, | |||
| // if the caller happens to know we're on the same segmentIndex | |||
| // from a previous invocation | |||
| if cFields == nil { | |||
| cFields = subtractStrings(fields, vFields) | |||
| if !ss.cachedDocs.hasFields(cFields) { | |||
| errCh = make(chan error, 1) | |||
| go func() { | |||
| err := ss.cachedDocs.prepareFields(cFields, ss) | |||
| if err != nil { | |||
| errCh <- err | |||
| } | |||
| close(errCh) | |||
| }() | |||
| } | |||
| } | |||
| // visit the persisted dv while the cache preparation is in progress | |||
| err = zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor) | |||
| if ssvOk && ssv != nil && len(vFields) > 0 { | |||
| dvs, err = ssv.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs) | |||
| if err != nil { | |||
| return err | |||
| return nil, nil, err | |||
| } | |||
| } | |||
| // err out if fieldCache preparation failed | |||
| if errCh != nil { | |||
| err = <-errCh | |||
| if err != nil { | |||
| return err | |||
| return nil, nil, err | |||
| } | |||
| } | |||
| visitDocumentFieldCacheTerms(localDocNum, dvPendingFields, ss, visitor) | |||
| return nil | |||
| if len(cFields) > 0 { | |||
| ss.cachedDocs.visitDoc(localDocNum, cFields, visitor) | |||
| } | |||
| return prepareCacheVisitDocumentFieldTerms(localDocNum, fields, ss, visitor) | |||
| return cFields, dvs, nil | |||
| } | |||
| func (i *IndexSnapshot) DocValueReader(fields []string) ( | |||
| index.DocValueReader, error) { | |||
| return &DocValueReader{i: i, fields: fields, currSegmentIndex: -1}, nil | |||
| } | |||
| type DocValueReader struct { | |||
| i *IndexSnapshot | |||
| fields []string | |||
| dvs segment.DocVisitState | |||
| currSegmentIndex int | |||
| currCachedFields []string | |||
| } | |||
| func prepareCacheVisitDocumentFieldTerms(localDocNum uint64, fields []string, | |||
| ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) error { | |||
| err := ss.cachedDocs.prepareFields(fields, ss) | |||
| func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID, | |||
| visitor index.DocumentFieldTermVisitor) (err error) { | |||
| docNum, err := docInternalToNumber(id) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| visitDocumentFieldCacheTerms(localDocNum, fields, ss, visitor) | |||
| return nil | |||
| segmentIndex, localDocNum := dvr.i.segmentIndexAndLocalDocNumFromGlobal(docNum) | |||
| if segmentIndex >= len(dvr.i.segment) { | |||
| return nil | |||
| } | |||
| if dvr.currSegmentIndex != segmentIndex { | |||
| dvr.currSegmentIndex = segmentIndex | |||
| dvr.currCachedFields = nil | |||
| } | |||
| dvr.currCachedFields, dvr.dvs, err = dvr.i.documentVisitFieldTermsOnSegment( | |||
| dvr.currSegmentIndex, localDocNum, dvr.fields, dvr.currCachedFields, visitor, dvr.dvs) | |||
| return err | |||
| } | |||
| func visitDocumentFieldCacheTerms(localDocNum uint64, fields []string, | |||
| ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) { | |||
| func (i *IndexSnapshot) DumpAll() chan interface{} { | |||
| rv := make(chan interface{}) | |||
| go func() { | |||
| close(rv) | |||
| }() | |||
| return rv | |||
| } | |||
| for _, field := range fields { | |||
| if cachedFieldDocs, exists := ss.cachedDocs.cache[field]; exists { | |||
| if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists { | |||
| for { | |||
| i := bytes.Index(tlist, TermSeparatorSplitSlice) | |||
| if i < 0 { | |||
| break | |||
| } | |||
| visitor(field, tlist[0:i]) | |||
| tlist = tlist[i+1:] | |||
| } | |||
| } | |||
| } | |||
| } | |||
| func (i *IndexSnapshot) DumpDoc(id string) chan interface{} { | |||
| rv := make(chan interface{}) | |||
| go func() { | |||
| close(rv) | |||
| }() | |||
| return rv | |||
| } | |||
| func (i *IndexSnapshot) DumpFields() chan interface{} { | |||
| rv := make(chan interface{}) | |||
| go func() { | |||
| close(rv) | |||
| }() | |||
| return rv | |||
| } | |||
| func extractDvPendingFields(requestedFields, persistedFields []string) []string { | |||
| removeMap := map[string]struct{}{} | |||
| for _, str := range persistedFields { | |||
| removeMap[str] = struct{}{} | |||
| // subtractStrings returns set a minus elements of set b. | |||
| func subtractStrings(a, b []string) []string { | |||
| if len(b) == 0 { | |||
| return a | |||
| } | |||
| rv := make([]string, 0, len(requestedFields)) | |||
| for _, s := range requestedFields { | |||
| if _, ok := removeMap[s]; !ok { | |||
| rv = append(rv, s) | |||
| rv := make([]string, 0, len(a)) | |||
| OUTER: | |||
| for _, as := range a { | |||
| for _, bs := range b { | |||
| if as == bs { | |||
| continue OUTER | |||
| } | |||
| } | |||
| rv = append(rv, as) | |||
| } | |||
| return rv | |||
| } | |||
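For example, subtractStrings([]string{"title", "body", "tags"}, []string{"body"}) yields ["title", "tags"]. The nested scan is O(len(a) * len(b)), which is fine here since field lists are short, and the len(b) == 0 fast path returns a without allocating.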
| @@ -23,12 +23,13 @@ import ( | |||
| type segmentDictCursor struct { | |||
| itr segment.DictionaryIterator | |||
| curr *index.DictEntry | |||
| curr index.DictEntry | |||
| } | |||
| type IndexSnapshotFieldDict struct { | |||
| snapshot *IndexSnapshot | |||
| cursors []*segmentDictCursor | |||
| entry index.DictEntry | |||
| } | |||
| func (i *IndexSnapshotFieldDict) Len() int { return len(i.cursors) } | |||
| @@ -51,10 +52,10 @@ func (i *IndexSnapshotFieldDict) Pop() interface{} { | |||
| } | |||
| func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) { | |||
| if len(i.cursors) <= 0 { | |||
| if len(i.cursors) == 0 { | |||
| return nil, nil | |||
| } | |||
| rv := i.cursors[0].curr | |||
| i.entry = i.cursors[0].curr | |||
| next, err := i.cursors[0].itr.Next() | |||
| if err != nil { | |||
| return nil, err | |||
| @@ -64,12 +65,12 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) { | |||
| heap.Pop(i) | |||
| } else { | |||
| // modified heap, fix it | |||
| i.cursors[0].curr = next | |||
| i.cursors[0].curr = *next | |||
| heap.Fix(i, 0) | |||
| } | |||
| // look for any other entries with the exact same term | |||
| for len(i.cursors) > 0 && i.cursors[0].curr.Term == rv.Term { | |||
| rv.Count += i.cursors[0].curr.Count | |||
| for len(i.cursors) > 0 && i.cursors[0].curr.Term == i.entry.Term { | |||
| i.entry.Count += i.cursors[0].curr.Count | |||
| next, err := i.cursors[0].itr.Next() | |||
| if err != nil { | |||
| return nil, err | |||
| @@ -79,12 +80,12 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) { | |||
| heap.Pop(i) | |||
| } else { | |||
| // modified heap, fix it | |||
| i.cursors[0].curr = next | |||
| i.cursors[0].curr = *next | |||
| heap.Fix(i, 0) | |||
| } | |||
| } | |||
| return rv, nil | |||
| return &i.entry, nil | |||
| } | |||
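Next() is a k-way merge: the heap keeps the cursor with the smallest current term on top, and equal terms from other segments are folded in by summing their counts into i.entry (now a value rather than a pointer, so the returned entry no longer aliases a heap cursor that Next() is about to advance). A self-contained sketch of the same merge over plain sorted slices:

```go
package main

import (
	"container/heap"
	"fmt"
)

type cursor struct {
	terms []string // sorted ascending
	pos   int
}

type merge []*cursor

func (m merge) Len() int            { return len(m) }
func (m merge) Less(a, b int) bool  { return m[a].terms[m[a].pos] < m[b].terms[m[b].pos] }
func (m merge) Swap(a, b int)       { m[a], m[b] = m[b], m[a] }
func (m *merge) Push(x interface{}) { *m = append(*m, x.(*cursor)) }
func (m *merge) Pop() interface{} {
	old := *m
	c := old[len(old)-1]
	*m = old[:len(old)-1]
	return c
}

// advance moves the top cursor forward, popping it when exhausted,
// mirroring the heap.Pop / heap.Fix pair above.
func advance(m *merge) {
	c := (*m)[0]
	c.pos++
	if c.pos >= len(c.terms) {
		heap.Pop(m)
	} else {
		heap.Fix(m, 0)
	}
}

func main() {
	m := &merge{
		{terms: []string{"apple", "cherry"}},
		{terms: []string{"apple", "banana"}},
	}
	heap.Init(m)
	for m.Len() > 0 {
		term, count := (*m)[0].terms[(*m)[0].pos], 1
		advance(m)
		// fold in the same term from other cursors, summing counts
		for m.Len() > 0 && (*m)[0].terms[(*m)[0].pos] == term {
			count++
			advance(m)
		}
		fmt.Println(term, count) // apple 2, banana 1, cherry 1
	}
}
```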
| func (i *IndexSnapshotFieldDict) Close() error { | |||
| @@ -16,17 +16,30 @@ package scorch | |||
| import ( | |||
| "bytes" | |||
| "reflect" | |||
| "github.com/RoaringBitmap/roaring" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeIndexSnapshotDocIDReader int | |||
| func init() { | |||
| var isdr IndexSnapshotDocIDReader | |||
| reflectStaticSizeIndexSnapshotDocIDReader = int(reflect.TypeOf(isdr).Size()) | |||
| } | |||
| type IndexSnapshotDocIDReader struct { | |||
| snapshot *IndexSnapshot | |||
| iterators []roaring.IntIterable | |||
| segmentOffset int | |||
| } | |||
| func (i *IndexSnapshotDocIDReader) Size() int { | |||
| return reflectStaticSizeIndexSnapshotDocIDReader + size.SizeOfPtr | |||
| } | |||
| func (i *IndexSnapshotDocIDReader) Next() (index.IndexInternalID, error) { | |||
| for i.segmentOffset < len(i.iterators) { | |||
| if !i.iterators[i.segmentOffset].HasNext() { | |||
| @@ -16,16 +16,27 @@ package scorch | |||
| import ( | |||
| "bytes" | |||
| "fmt" | |||
| "reflect" | |||
| "sync/atomic" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/index/scorch/segment" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeIndexSnapshotTermFieldReader int | |||
| func init() { | |||
| var istfr IndexSnapshotTermFieldReader | |||
| reflectStaticSizeIndexSnapshotTermFieldReader = int(reflect.TypeOf(istfr).Size()) | |||
| } | |||
| type IndexSnapshotTermFieldReader struct { | |||
| term []byte | |||
| field string | |||
| snapshot *IndexSnapshot | |||
| dicts []segment.TermDictionary | |||
| postings []segment.PostingsList | |||
| iterators []segment.PostingsIterator | |||
| segmentOffset int | |||
| @@ -36,13 +47,34 @@ type IndexSnapshotTermFieldReader struct { | |||
| currID index.IndexInternalID | |||
| } | |||
| func (i *IndexSnapshotTermFieldReader) Size() int { | |||
| sizeInBytes := reflectStaticSizeIndexSnapshotTermFieldReader + size.SizeOfPtr + | |||
| len(i.term) + | |||
| len(i.field) + | |||
| len(i.currID) | |||
| for _, entry := range i.postings { | |||
| sizeInBytes += entry.Size() | |||
| } | |||
| for _, entry := range i.iterators { | |||
| sizeInBytes += entry.Size() | |||
| } | |||
| if i.currPosting != nil { | |||
| sizeInBytes += i.currPosting.Size() | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { | |||
| rv := preAlloced | |||
| if rv == nil { | |||
| rv = &index.TermFieldDoc{} | |||
| } | |||
| // find the next hit | |||
| for i.segmentOffset < len(i.postings) { | |||
| for i.segmentOffset < len(i.iterators) { | |||
| next, err := i.iterators[i.segmentOffset].Next() | |||
| if err != nil { | |||
| return nil, err | |||
| @@ -72,9 +104,16 @@ func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Postin | |||
| } | |||
| if i.includeTermVectors { | |||
| locs := next.Locations() | |||
| rv.Vectors = make([]*index.TermFieldVector, len(locs)) | |||
| if cap(rv.Vectors) < len(locs) { | |||
| rv.Vectors = make([]*index.TermFieldVector, len(locs)) | |||
| backing := make([]index.TermFieldVector, len(locs)) | |||
| for i := range backing { | |||
| rv.Vectors[i] = &backing[i] | |||
| } | |||
| } | |||
| rv.Vectors = rv.Vectors[:len(locs)] | |||
| for i, loc := range locs { | |||
| rv.Vectors[i] = &index.TermFieldVector{ | |||
| *rv.Vectors[i] = index.TermFieldVector{ | |||
| Start: loc.Start(), | |||
| End: loc.End(), | |||
| Pos: loc.Pos(), | |||
| @@ -96,24 +135,37 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo | |||
| } | |||
| *i = *(i2.(*IndexSnapshotTermFieldReader)) | |||
| } | |||
| // FIXME do something better | |||
| next, err := i.Next(preAlloced) | |||
| num, err := docInternalToNumber(ID) | |||
| if err != nil { | |||
| return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err) | |||
| } | |||
| segIndex, ldocNum := i.snapshot.segmentIndexAndLocalDocNumFromGlobal(num) | |||
| if segIndex >= len(i.snapshot.segment) { | |||
| return nil, fmt.Errorf("computed segment index %d out of bounds %d", | |||
| segIndex, len(i.snapshot.segment)) | |||
| } | |||
| // skip directly to the target segment | |||
| i.segmentOffset = segIndex | |||
| next, err := i.iterators[i.segmentOffset].Advance(ldocNum) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| if next == nil { | |||
| return nil, nil | |||
| // we jumped directly to the segment that should have contained it | |||
| // but it wasn't there, so reuse Next() which should correctly | |||
| // get the next hit after it (we moved i.segmentOffset) | |||
| return i.Next(preAlloced) | |||
| } | |||
| for bytes.Compare(next.ID, ID) < 0 { | |||
| next, err = i.Next(preAlloced) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| if next == nil { | |||
| break | |||
| } | |||
| if preAlloced == nil { | |||
| preAlloced = &index.TermFieldDoc{} | |||
| } | |||
| return next, nil | |||
| preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+ | |||
| i.snapshot.offsets[segIndex]) | |||
| i.postingToTermFieldDoc(next, preAlloced) | |||
| i.currID = preAlloced.ID | |||
| i.currPosting = next | |||
| return preAlloced, nil | |||
| } | |||
| func (i *IndexSnapshotTermFieldReader) Count() uint64 { | |||
| @@ -126,7 +178,8 @@ func (i *IndexSnapshotTermFieldReader) Count() uint64 { | |||
| func (i *IndexSnapshotTermFieldReader) Close() error { | |||
| if i.snapshot != nil { | |||
| atomic.AddUint64(&i.snapshot.parent.stats.termSearchersFinished, uint64(1)) | |||
| atomic.AddUint64(&i.snapshot.parent.stats.TotTermSearchersFinished, uint64(1)) | |||
| i.snapshot.recycleTermFieldReader(i) | |||
| } | |||
| return nil | |||
| } | |||
| @@ -19,7 +19,7 @@ import ( | |||
| "log" | |||
| "github.com/blevesearch/bleve/index/scorch/segment" | |||
| "github.com/boltdb/bolt" | |||
| bolt "github.com/etcd-io/bbolt" | |||
| ) | |||
| type RollbackPoint struct { | |||
| @@ -15,42 +15,25 @@ | |||
| package scorch | |||
| import ( | |||
| "bytes" | |||
| "sync" | |||
| "sync/atomic" | |||
| "github.com/RoaringBitmap/roaring" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/index/scorch/segment" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var TermSeparator byte = 0xff | |||
| var TermSeparatorSplitSlice = []byte{TermSeparator} | |||
| type SegmentDictionarySnapshot struct { | |||
| s *SegmentSnapshot | |||
| d segment.TermDictionary | |||
| } | |||
| func (s *SegmentDictionarySnapshot) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { | |||
| // TODO: if except is non-nil, perhaps need to OR it with s.s.deleted? | |||
| return s.d.PostingsList(term, s.s.deleted) | |||
| } | |||
| func (s *SegmentDictionarySnapshot) Iterator() segment.DictionaryIterator { | |||
| return s.d.Iterator() | |||
| } | |||
| func (s *SegmentDictionarySnapshot) PrefixIterator(prefix string) segment.DictionaryIterator { | |||
| return s.d.PrefixIterator(prefix) | |||
| } | |||
| func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.DictionaryIterator { | |||
| return s.d.RangeIterator(start, end) | |||
| } | |||
| type SegmentSnapshot struct { | |||
| id uint64 | |||
| segment segment.Segment | |||
| deleted *roaring.Bitmap | |||
| creator string | |||
| cachedDocs *cachedDocs | |||
| } | |||
| @@ -83,8 +66,11 @@ func (s *SegmentSnapshot) VisitDocument(num uint64, visitor segment.DocumentFiel | |||
| return s.segment.VisitDocument(num, visitor) | |||
| } | |||
| func (s *SegmentSnapshot) Count() uint64 { | |||
| func (s *SegmentSnapshot) DocID(num uint64) ([]byte, error) { | |||
| return s.segment.DocID(num) | |||
| } | |||
| func (s *SegmentSnapshot) Count() uint64 { | |||
| rv := s.segment.Count() | |||
| if s.deleted != nil { | |||
| rv -= s.deleted.GetCardinality() | |||
| @@ -92,17 +78,6 @@ func (s *SegmentSnapshot) Count() uint64 { | |||
| return rv | |||
| } | |||
| func (s *SegmentSnapshot) Dictionary(field string) (segment.TermDictionary, error) { | |||
| d, err := s.segment.Dictionary(field) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| return &SegmentDictionarySnapshot{ | |||
| s: s, | |||
| d: d, | |||
| }, nil | |||
| } | |||
| func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) { | |||
| rv, err := s.segment.DocNumbers(docIDs) | |||
| if err != nil { | |||
| @@ -114,7 +89,7 @@ func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) { | |||
| return rv, nil | |||
| } | |||
| // DocNumbersLive returns bitsit containing doc numbers for all live docs | |||
| // DocNumbersLive returns a bitmap containing doc numbers for all live docs | |||
| func (s *SegmentSnapshot) DocNumbersLive() *roaring.Bitmap { | |||
| rv := roaring.NewBitmap() | |||
| rv.AddRange(0, s.segment.Count()) | |||
| @@ -128,36 +103,68 @@ func (s *SegmentSnapshot) Fields() []string { | |||
| return s.segment.Fields() | |||
| } | |||
| func (s *SegmentSnapshot) Size() (rv int) { | |||
| rv = s.segment.Size() | |||
| if s.deleted != nil { | |||
| rv += int(s.deleted.GetSizeInBytes()) | |||
| } | |||
| rv += s.cachedDocs.Size() | |||
| return | |||
| } | |||
| type cachedFieldDocs struct { | |||
| m sync.Mutex | |||
| readyCh chan struct{} // closed when the cachedFieldDocs.docs is ready to be used. | |||
| err error // Non-nil if there was an error when preparing this cachedFieldDocs. | |||
| docs map[uint64][]byte // Keyed by localDocNum, value is a list of terms delimited by 0xFF. | |||
| size uint64 | |||
| } | |||
| func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) { | |||
| defer close(cfd.readyCh) | |||
| func (cfd *cachedFieldDocs) Size() int { | |||
| var rv int | |||
| cfd.m.Lock() | |||
| for _, entry := range cfd.docs { | |||
| rv += 8 /* size of uint64 */ + len(entry) | |||
| } | |||
| cfd.m.Unlock() | |||
| return rv | |||
| } | |||
| func (cfd *cachedFieldDocs) prepareField(field string, ss *SegmentSnapshot) { | |||
| cfd.m.Lock() | |||
| defer func() { | |||
| close(cfd.readyCh) | |||
| cfd.m.Unlock() | |||
| }() | |||
| cfd.size += uint64(size.SizeOfUint64) /* size field */ | |||
| dict, err := ss.segment.Dictionary(field) | |||
| if err != nil { | |||
| cfd.err = err | |||
| return | |||
| } | |||
| var postings segment.PostingsList | |||
| var postingsItr segment.PostingsIterator | |||
| dictItr := dict.Iterator() | |||
| next, err := dictItr.Next() | |||
| for err == nil && next != nil { | |||
| postings, err1 := dict.PostingsList(next.Term, nil) | |||
| var err1 error | |||
| postings, err1 = dict.PostingsList([]byte(next.Term), nil, postings) | |||
| if err1 != nil { | |||
| cfd.err = err1 | |||
| return | |||
| } | |||
| postingsItr := postings.Iterator() | |||
| cfd.size += uint64(size.SizeOfUint64) /* map key */ | |||
| postingsItr = postings.Iterator(false, false, false, postingsItr) | |||
| nextPosting, err2 := postingsItr.Next() | |||
| for err2 == nil && nextPosting != nil { | |||
| docNum := nextPosting.Number() | |||
| cfd.docs[docNum] = append(cfd.docs[docNum], []byte(next.Term)...) | |||
| cfd.docs[docNum] = append(cfd.docs[docNum], TermSeparator) | |||
| cfd.size += uint64(len(next.Term) + 1) // map value | |||
| nextPosting, err2 = postingsItr.Next() | |||
| } | |||
| @@ -178,10 +185,12 @@ func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) { | |||
| type cachedDocs struct { | |||
| m sync.Mutex // As the cache is asynchronously prepared, need a lock | |||
| cache map[string]*cachedFieldDocs // Keyed by field | |||
| size uint64 | |||
| } | |||
| func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) error { | |||
| c.m.Lock() | |||
| if c.cache == nil { | |||
| c.cache = make(map[string]*cachedFieldDocs, len(ss.Fields())) | |||
| } | |||
| @@ -194,7 +203,7 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e | |||
| docs: make(map[uint64][]byte), | |||
| } | |||
| go c.cache[field].prepareFields(field, ss) | |||
| go c.cache[field].prepareField(field, ss) | |||
| } | |||
| } | |||
| @@ -209,21 +218,62 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e | |||
| c.m.Lock() | |||
| } | |||
| c.updateSizeLOCKED() | |||
| c.m.Unlock() | |||
| return nil | |||
| } | |||
| func (c *cachedDocs) sizeInBytes() uint64 { | |||
| sizeInBytes := 0 | |||
| // hasFields returns true if the cache has all the given fields | |||
| func (c *cachedDocs) hasFields(fields []string) bool { | |||
| c.m.Lock() | |||
| for _, field := range fields { | |||
| if _, exists := c.cache[field]; !exists { | |||
| c.m.Unlock() | |||
| return false // found a field not in cache | |||
| } | |||
| } | |||
| c.m.Unlock() | |||
| return true | |||
| } | |||
| func (c *cachedDocs) Size() int { | |||
| return int(atomic.LoadUint64(&c.size)) | |||
| } | |||
| func (c *cachedDocs) updateSizeLOCKED() { | |||
| sizeInBytes := 0 | |||
| for k, v := range c.cache { // cachedFieldDocs | |||
| sizeInBytes += len(k) | |||
| if v != nil { | |||
| for _, entry := range v.docs { // docs | |||
| sizeInBytes += 8 /* size of uint64 */ + len(entry) | |||
| sizeInBytes += v.Size() | |||
| } | |||
| } | |||
| atomic.StoreUint64(&c.size, uint64(sizeInBytes)) | |||
| } | |||
| func (c *cachedDocs) visitDoc(localDocNum uint64, | |||
| fields []string, visitor index.DocumentFieldTermVisitor) { | |||
| c.m.Lock() | |||
| for _, field := range fields { | |||
| if cachedFieldDocs, exists := c.cache[field]; exists { | |||
| c.m.Unlock() | |||
| <-cachedFieldDocs.readyCh | |||
| c.m.Lock() | |||
| if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists { | |||
| for { | |||
| i := bytes.Index(tlist, TermSeparatorSplitSlice) | |||
| if i < 0 { | |||
| break | |||
| } | |||
| visitor(field, tlist[0:i]) | |||
| tlist = tlist[i+1:] | |||
| } | |||
| } | |||
| } | |||
| } | |||
| c.m.Unlock() | |||
| return uint64(sizeInBytes) | |||
| } | |||
| @@ -16,63 +16,125 @@ package scorch | |||
| import ( | |||
| "encoding/json" | |||
| "io/ioutil" | |||
| "reflect" | |||
| "sync/atomic" | |||
| ) | |||
| // Stats tracks statistics about the index | |||
| // Stats tracks statistics about the index. Fields that are | |||
| // prefixed like CurXxxx are gauges (can go up and down), | |||
| // and fields that are prefixed like TotXxxx are monotonically | |||
| // increasing counters. | |||
| type Stats struct { | |||
| updates, deletes, batches, errors uint64 | |||
| analysisTime, indexTime uint64 | |||
| termSearchersStarted uint64 | |||
| termSearchersFinished uint64 | |||
| numPlainTextBytesIndexed uint64 | |||
| numItemsIntroduced uint64 | |||
| numItemsPersisted uint64 | |||
| i *Scorch | |||
| } | |||
| TotUpdates uint64 | |||
| TotDeletes uint64 | |||
| func (s *Stats) statsMap() (map[string]interface{}, error) { | |||
| m := map[string]interface{}{} | |||
| m["updates"] = atomic.LoadUint64(&s.updates) | |||
| m["deletes"] = atomic.LoadUint64(&s.deletes) | |||
| m["batches"] = atomic.LoadUint64(&s.batches) | |||
| m["errors"] = atomic.LoadUint64(&s.errors) | |||
| m["analysis_time"] = atomic.LoadUint64(&s.analysisTime) | |||
| m["index_time"] = atomic.LoadUint64(&s.indexTime) | |||
| m["term_searchers_started"] = atomic.LoadUint64(&s.termSearchersStarted) | |||
| m["term_searchers_finished"] = atomic.LoadUint64(&s.termSearchersFinished) | |||
| m["num_plain_text_bytes_indexed"] = atomic.LoadUint64(&s.numPlainTextBytesIndexed) | |||
| m["num_items_introduced"] = atomic.LoadUint64(&s.numItemsIntroduced) | |||
| m["num_items_persisted"] = atomic.LoadUint64(&s.numItemsPersisted) | |||
| if s.i.path != "" { | |||
| finfos, err := ioutil.ReadDir(s.i.path) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| TotBatches uint64 | |||
| TotBatchesEmpty uint64 | |||
| TotBatchIntroTime uint64 | |||
| MaxBatchIntroTime uint64 | |||
| var numFilesOnDisk, numBytesUsedDisk uint64 | |||
| CurRootEpoch uint64 | |||
| LastPersistedEpoch uint64 | |||
| LastMergedEpoch uint64 | |||
| for _, finfo := range finfos { | |||
| if !finfo.IsDir() { | |||
| numBytesUsedDisk += uint64(finfo.Size()) | |||
| numFilesOnDisk++ | |||
| } | |||
| } | |||
| TotOnErrors uint64 | |||
| m["num_bytes_used_disk"] = numBytesUsedDisk | |||
| m["num_files_on_disk"] = numFilesOnDisk | |||
| } | |||
| TotAnalysisTime uint64 | |||
| TotIndexTime uint64 | |||
| TotIndexedPlainTextBytes uint64 | |||
| TotTermSearchersStarted uint64 | |||
| TotTermSearchersFinished uint64 | |||
| TotIntroduceLoop uint64 | |||
| TotIntroduceSegmentBeg uint64 | |||
| TotIntroduceSegmentEnd uint64 | |||
| TotIntroducePersistBeg uint64 | |||
| TotIntroducePersistEnd uint64 | |||
| TotIntroduceMergeBeg uint64 | |||
| TotIntroduceMergeEnd uint64 | |||
| TotIntroduceRevertBeg uint64 | |||
| TotIntroduceRevertEnd uint64 | |||
| TotIntroducedItems uint64 | |||
| TotIntroducedSegmentsBatch uint64 | |||
| TotIntroducedSegmentsMerge uint64 | |||
| TotPersistLoopBeg uint64 | |||
| TotPersistLoopErr uint64 | |||
| TotPersistLoopProgress uint64 | |||
| TotPersistLoopWait uint64 | |||
| TotPersistLoopWaitNotified uint64 | |||
| TotPersistLoopEnd uint64 | |||
| TotPersistedItems uint64 | |||
| TotItemsToPersist uint64 | |||
| TotPersistedSegments uint64 | |||
| TotPersisterSlowMergerPause uint64 | |||
| TotPersisterSlowMergerResume uint64 | |||
| TotPersisterNapPauseCompleted uint64 | |||
| TotPersisterMergerNapBreak uint64 | |||
| return m, nil | |||
| TotFileMergeLoopBeg uint64 | |||
| TotFileMergeLoopErr uint64 | |||
| TotFileMergeLoopEnd uint64 | |||
| TotFileMergePlan uint64 | |||
| TotFileMergePlanErr uint64 | |||
| TotFileMergePlanNone uint64 | |||
| TotFileMergePlanOk uint64 | |||
| TotFileMergePlanTasks uint64 | |||
| TotFileMergePlanTasksDone uint64 | |||
| TotFileMergePlanTasksErr uint64 | |||
| TotFileMergePlanTasksSegments uint64 | |||
| TotFileMergePlanTasksSegmentsEmpty uint64 | |||
| TotFileMergeSegmentsEmpty uint64 | |||
| TotFileMergeSegments uint64 | |||
| TotFileSegmentsAtRoot uint64 | |||
| TotFileMergeWrittenBytes uint64 | |||
| TotFileMergeZapBeg uint64 | |||
| TotFileMergeZapEnd uint64 | |||
| TotFileMergeZapTime uint64 | |||
| MaxFileMergeZapTime uint64 | |||
| TotFileMergeIntroductions uint64 | |||
| TotFileMergeIntroductionsDone uint64 | |||
| TotFileMergeIntroductionsSkipped uint64 | |||
| TotMemMergeBeg uint64 | |||
| TotMemMergeErr uint64 | |||
| TotMemMergeDone uint64 | |||
| TotMemMergeZapBeg uint64 | |||
| TotMemMergeZapEnd uint64 | |||
| TotMemMergeZapTime uint64 | |||
| MaxMemMergeZapTime uint64 | |||
| TotMemMergeSegments uint64 | |||
| TotMemorySegmentsAtRoot uint64 | |||
| } | |||
| // MarshalJSON implements json.Marshaler | |||
| func (s *Stats) MarshalJSON() ([]byte, error) { | |||
| m, err := s.statsMap() | |||
| if err != nil { | |||
| return nil, err | |||
| // atomically populates the returned map | |||
| func (s *Stats) ToMap() map[string]interface{} { | |||
| m := map[string]interface{}{} | |||
| sve := reflect.ValueOf(s).Elem() | |||
| svet := sve.Type() | |||
| for i := 0; i < svet.NumField(); i++ { | |||
| svef := sve.Field(i) | |||
| if svef.CanAddr() { | |||
| svefp := svef.Addr().Interface() | |||
| m[svet.Field(i).Name] = atomic.LoadUint64(svefp.(*uint64)) | |||
| } | |||
| } | |||
| return json.Marshal(m) | |||
| return m | |||
| } | |||
| // MarshalJSON implements json.Marshaler, and in contrast to standard | |||
| // json marshaling provides atomic safety | |||
| func (s *Stats) MarshalJSON() ([]byte, error) { | |||
| return json.Marshal(s.ToMap()) | |||
| } | |||
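ToMap walks the struct via reflection and reads every exported uint64 field with atomic.LoadUint64, so a stats snapshot taken while searches and merges are still bumping counters never observes a torn value; the Go field name becomes the map key. The same technique on a hypothetical counter struct:

```go
package main

import (
	"encoding/json"
	"fmt"
	"reflect"
	"sync/atomic"
)

type counters struct {
	TotOps  uint64
	TotErrs uint64
}

// toMap atomically snapshots every uint64 field, keyed by field name.
func toMap(c *counters) map[string]interface{} {
	m := map[string]interface{}{}
	v := reflect.ValueOf(c).Elem()
	t := v.Type()
	for i := 0; i < t.NumField(); i++ {
		f := v.Field(i)
		if f.Kind() == reflect.Uint64 && f.CanAddr() {
			m[t.Field(i).Name] = atomic.LoadUint64(f.Addr().Interface().(*uint64))
		}
	}
	return m
}

func main() {
	c := &counters{}
	atomic.AddUint64(&c.TotOps, 3)
	b, _ := json.Marshal(toMap(c))
	fmt.Println(string(b)) // {"TotErrs":0,"TotOps":3}
}
```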
| @@ -17,7 +17,7 @@ package boltdb | |||
| import ( | |||
| "bytes" | |||
| "github.com/boltdb/bolt" | |||
| bolt "github.com/etcd-io/bbolt" | |||
| ) | |||
| type Iterator struct { | |||
| @@ -16,7 +16,7 @@ package boltdb | |||
| import ( | |||
| "github.com/blevesearch/bleve/index/store" | |||
| "github.com/boltdb/bolt" | |||
| bolt "github.com/etcd-io/bbolt" | |||
| ) | |||
| type Reader struct { | |||
| @@ -30,7 +30,7 @@ import ( | |||
| "github.com/blevesearch/bleve/index/store" | |||
| "github.com/blevesearch/bleve/registry" | |||
| "github.com/boltdb/bolt" | |||
| bolt "github.com/etcd-io/bbolt" | |||
| ) | |||
| const ( | |||
| @@ -74,6 +74,12 @@ func New(mo store.MergeOperator, config map[string]interface{}) (store.KVStore, | |||
| bo.ReadOnly = ro | |||
| } | |||
| if initialMmapSize, ok := config["initialMmapSize"].(int); ok { | |||
| bo.InitialMmapSize = initialMmapSize | |||
| } else if initialMmapSize, ok := config["initialMmapSize"].(float64); ok { | |||
| bo.InitialMmapSize = int(initialMmapSize) | |||
| } | |||
| db, err := bolt.Open(path, 0600, bo) | |||
| if err != nil { | |||
| return nil, err | |||
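The new initialMmapSize option is checked as both int and float64 because a config map decoded from JSON carries numbers as float64, while one assembled in Go code typically holds an int. A quick stdlib demonstration of why the second type assertion is needed:

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	var config map[string]interface{}
	_ = json.Unmarshal([]byte(`{"initialMmapSize": 1073741824}`), &config)

	_, isInt := config["initialMmapSize"].(int)
	f, isFloat := config["initialMmapSize"].(float64)
	fmt.Println(isInt, isFloat, int(f)) // false true 1073741824
}
```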
| @@ -15,11 +15,20 @@ | |||
| package upsidedown | |||
| import ( | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/document" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/index/store" | |||
| ) | |||
| var reflectStaticSizeIndexReader int | |||
| func init() { | |||
| var ir IndexReader | |||
| reflectStaticSizeIndexReader = int(reflect.TypeOf(ir).Size()) | |||
| } | |||
| type IndexReader struct { | |||
| index *UpsideDownCouch | |||
| kvreader store.KVReader | |||
| @@ -201,3 +210,17 @@ func incrementBytes(in []byte) []byte { | |||
| } | |||
| return rv | |||
| } | |||
| func (i *IndexReader) DocValueReader(fields []string) (index.DocValueReader, error) { | |||
| return &DocValueReader{i: i, fields: fields}, nil | |||
| } | |||
| type DocValueReader struct { | |||
| i *IndexReader | |||
| fields []string | |||
| } | |||
| func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID, | |||
| visitor index.DocumentFieldTermVisitor) error { | |||
| return dvr.i.DocumentVisitFieldTerms(id, dvr.fields, visitor) | |||
| } | |||
| @@ -16,13 +16,27 @@ package upsidedown | |||
| import ( | |||
| "bytes" | |||
| "reflect" | |||
| "sort" | |||
| "sync/atomic" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/index/store" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeUpsideDownCouchTermFieldReader int | |||
| var reflectStaticSizeUpsideDownCouchDocIDReader int | |||
| func init() { | |||
| var tfr UpsideDownCouchTermFieldReader | |||
| reflectStaticSizeUpsideDownCouchTermFieldReader = | |||
| int(reflect.TypeOf(tfr).Size()) | |||
| var cdr UpsideDownCouchDocIDReader | |||
| reflectStaticSizeUpsideDownCouchDocIDReader = | |||
| int(reflect.TypeOf(cdr).Size()) | |||
| } | |||
| type UpsideDownCouchTermFieldReader struct { | |||
| count uint64 | |||
| indexReader *IndexReader | |||
| @@ -35,6 +49,19 @@ type UpsideDownCouchTermFieldReader struct { | |||
| includeTermVectors bool | |||
| } | |||
| func (r *UpsideDownCouchTermFieldReader) Size() int { | |||
| sizeInBytes := reflectStaticSizeUpsideDownCouchTermFieldReader + size.SizeOfPtr + | |||
| len(r.term) + | |||
| r.tfrPrealloc.Size() + | |||
| len(r.keyBuf) | |||
| if r.tfrNext != nil { | |||
| sizeInBytes += r.tfrNext.Size() | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| func newUpsideDownCouchTermFieldReader(indexReader *IndexReader, term []byte, field uint16, includeFreq, includeNorm, includeTermVectors bool) (*UpsideDownCouchTermFieldReader, error) { | |||
| bufNeeded := termFrequencyRowKeySize(term, nil) | |||
| if bufNeeded < dictionaryRowKeySize(term) { | |||
| @@ -174,8 +201,18 @@ type UpsideDownCouchDocIDReader struct { | |||
| onlyMode bool | |||
| } | |||
| func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) { | |||
| func (r *UpsideDownCouchDocIDReader) Size() int { | |||
| sizeInBytes := reflectStaticSizeUpsideDownCouchDocIDReader + | |||
| reflectStaticSizeIndexReader + size.SizeOfPtr | |||
| for _, entry := range r.only { | |||
| sizeInBytes += size.SizeOfString + len(entry) | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) { | |||
| startBytes := []byte{0x0} | |||
| endBytes := []byte{0xff} | |||
| @@ -20,10 +20,22 @@ import ( | |||
| "fmt" | |||
| "io" | |||
| "math" | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/size" | |||
| "github.com/golang/protobuf/proto" | |||
| ) | |||
| var reflectStaticSizeTermFrequencyRow int | |||
| var reflectStaticSizeTermVector int | |||
| func init() { | |||
| var tfr TermFrequencyRow | |||
| reflectStaticSizeTermFrequencyRow = int(reflect.TypeOf(tfr).Size()) | |||
| var tv TermVector | |||
| reflectStaticSizeTermVector = int(reflect.TypeOf(tv).Size()) | |||
| } | |||
| const ByteSeparator byte = 0xff | |||
| type UpsideDownCouchRowStream chan UpsideDownCouchRow | |||
| @@ -358,6 +370,11 @@ type TermVector struct { | |||
| end uint64 | |||
| } | |||
| func (tv *TermVector) Size() int { | |||
| return reflectStaticSizeTermVector + size.SizeOfPtr + | |||
| len(tv.arrayPositions)*size.SizeOfUint64 | |||
| } | |||
| func (tv *TermVector) String() string { | |||
| return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions) | |||
| } | |||
| @@ -371,6 +388,18 @@ type TermFrequencyRow struct { | |||
| field uint16 | |||
| } | |||
| func (tfr *TermFrequencyRow) Size() int { | |||
| sizeInBytes := reflectStaticSizeTermFrequencyRow + | |||
| len(tfr.term) + | |||
| len(tfr.doc) | |||
| for _, entry := range tfr.vectors { | |||
| sizeInBytes += entry.Size() | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| func (tfr *TermFrequencyRow) Term() []byte { | |||
| return tfr.term | |||
| } | |||
| @@ -555,7 +584,7 @@ func (tfr *TermFrequencyRow) parseK(key []byte) error { | |||
| func (tfr *TermFrequencyRow) parseKDoc(key []byte, term []byte) error { | |||
| tfr.doc = key[3+len(term)+1:] | |||
| if len(tfr.doc) <= 0 { | |||
| if len(tfr.doc) == 0 { | |||
| return fmt.Errorf("invalid term frequency key, empty docid") | |||
| } | |||
| @@ -775,7 +775,7 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis. | |||
| } | |||
| func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector { | |||
| if len(in) <= 0 { | |||
| if len(in) == 0 { | |||
| return nil | |||
| } | |||
| @@ -810,15 +810,17 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) { | |||
| } | |||
| } | |||
| go func() { | |||
| for _, doc := range batch.IndexOps { | |||
| if doc != nil { | |||
| aw := index.NewAnalysisWork(udc, doc, resultChan) | |||
| // put the work on the queue | |||
| udc.analysisQueue.Queue(aw) | |||
| if len(batch.IndexOps) > 0 { | |||
| go func() { | |||
| for _, doc := range batch.IndexOps { | |||
| if doc != nil { | |||
| aw := index.NewAnalysisWork(udc, doc, resultChan) | |||
| // put the work on the queue | |||
| udc.analysisQueue.Queue(aw) | |||
| } | |||
| } | |||
| } | |||
| }() | |||
| }() | |||
| } | |||
| // retrieve back index rows concurrent with analysis | |||
| docBackIndexRowErr := error(nil) | |||
| @@ -958,6 +960,11 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) { | |||
| } else { | |||
| atomic.AddUint64(&udc.stats.errors, 1) | |||
| } | |||
| persistedCallback := batch.PersistedCallback() | |||
| if persistedCallback != nil { | |||
| persistedCallback(err) | |||
| } | |||
| return | |||
| } | |||
| @@ -433,6 +433,7 @@ func createChildSearchRequest(req *SearchRequest) *SearchRequest { | |||
| Explain: req.Explain, | |||
| Sort: req.Sort.Copy(), | |||
| IncludeLocations: req.IncludeLocations, | |||
| Score: req.Score, | |||
| } | |||
| return &rv | |||
| } | |||
| @@ -50,6 +50,12 @@ const storePath = "store" | |||
| var mappingInternalKey = []byte("_mapping") | |||
| const SearchQueryStartCallbackKey = "_search_query_start_callback_key" | |||
| const SearchQueryEndCallbackKey = "_search_query_end_callback_key" | |||
| type SearchQueryStartCallbackFn func(size uint64) error | |||
| type SearchQueryEndCallbackFn func(size uint64) error | |||
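Callers opt in by planting these callbacks in the context handed to SearchInContext: the start callback receives the memory estimate and can abort the query by returning an error, and the end callback is invoked (via defer) with the same estimate when the search finishes. A hedged sketch of admission control built on the identifiers this hunk introduces; the 64 MiB budget and the in-memory index are illustrative only:

```go
package main

import (
	"context"
	"fmt"

	"github.com/blevesearch/bleve"
)

func main() {
	idx, err := bleve.NewMemOnly(bleve.NewIndexMapping())
	if err != nil {
		panic(err)
	}

	ctx := context.WithValue(context.Background(),
		bleve.SearchQueryStartCallbackKey,
		bleve.SearchQueryStartCallbackFn(func(size uint64) error {
			if size > 64<<20 {
				return fmt.Errorf("query needs ~%d bytes, over budget", size)
			}
			return nil // admit the query
		}))
	ctx = context.WithValue(ctx,
		bleve.SearchQueryEndCallbackKey,
		bleve.SearchQueryEndCallbackFn(func(size uint64) error {
			// release whatever was reserved at start; bleve defers this call
			return nil
		}))

	req := bleve.NewSearchRequest(bleve.NewMatchAllQuery())
	_, err = idx.SearchInContext(ctx, req)
	fmt.Println("search err:", err)
}
```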
| func indexStorePath(path string) string { | |||
| return path + string(os.PathSeparator) + storePath | |||
| } | |||
| @@ -362,8 +368,70 @@ func (i *indexImpl) Search(req *SearchRequest) (sr *SearchResult, err error) { | |||
| return i.SearchInContext(context.Background(), req) | |||
| } | |||
| var documentMatchEmptySize int | |||
| var searchContextEmptySize int | |||
| var facetResultEmptySize int | |||
| var documentEmptySize int | |||
| func init() { | |||
| var dm search.DocumentMatch | |||
| documentMatchEmptySize = dm.Size() | |||
| var sc search.SearchContext | |||
| searchContextEmptySize = sc.Size() | |||
| var fr search.FacetResult | |||
| facetResultEmptySize = fr.Size() | |||
| var d document.Document | |||
| documentEmptySize = d.Size() | |||
| } | |||
| // memNeededForSearch is a helper function that returns an estimate of RAM | |||
| // needed to execute a search request. | |||
| func memNeededForSearch(req *SearchRequest, | |||
| searcher search.Searcher, | |||
| topnCollector *collector.TopNCollector) uint64 { | |||
| backingSize := req.Size + req.From + 1 | |||
| if req.Size+req.From > collector.PreAllocSizeSkipCap { | |||
| backingSize = collector.PreAllocSizeSkipCap + 1 | |||
| } | |||
| numDocMatches := backingSize + searcher.DocumentMatchPoolSize() | |||
| estimate := 0 | |||
| // overhead, size in bytes from collector | |||
| estimate += topnCollector.Size() | |||
| // pre-allocing DocumentMatchPool | |||
| estimate += searchContextEmptySize + numDocMatches*documentMatchEmptySize | |||
| // searcher overhead | |||
| estimate += searcher.Size() | |||
| // overhead from results, lowestMatchOutsideResults | |||
| estimate += (numDocMatches + 1) * documentMatchEmptySize | |||
| // additional overhead from SearchResult | |||
| estimate += reflectStaticSizeSearchResult + reflectStaticSizeSearchStatus | |||
| // overhead from facet results | |||
| if req.Facets != nil { | |||
| estimate += len(req.Facets) * facetResultEmptySize | |||
| } | |||
| // highlighting, store | |||
| if len(req.Fields) > 0 || req.Highlight != nil { | |||
| // Size + From => number of hits | |||
| estimate += (req.Size + req.From) * documentEmptySize | |||
| } | |||
| return uint64(estimate) | |||
| } | |||
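To make the shape of the estimate concrete, with illustrative numbers only: for req.Size = 10 and req.From = 0, backingSize is 11; with a searcher whose DocumentMatchPoolSize() is 50, numDocMatches is 61. If documentMatchEmptySize were 200 bytes, the pool term would contribute searchContextEmptySize + 61 * 200 bytes and the results term another 62 * 200 bytes, before the collector, searcher, facet, and stored-field terms are added.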
| // SearchInContext executes a search request operation within the provided | |||
| // Context. Returns a SearchResult object or an error. | |||
| func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr *SearchResult, err error) { | |||
| i.mutex.RLock() | |||
| defer i.mutex.RUnlock() | |||
| @@ -390,6 +458,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr | |||
| searcher, err := req.Query.Searcher(indexReader, i.m, search.SearcherOptions{ | |||
| Explain: req.Explain, | |||
| IncludeTermVectors: req.IncludeLocations || req.Highlight != nil, | |||
| Score: req.Score, | |||
| }) | |||
| if err != nil { | |||
| return nil, err | |||
| @@ -428,6 +497,24 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr | |||
| collector.SetFacetsBuilder(facetsBuilder) | |||
| } | |||
| memNeeded := memNeededForSearch(req, searcher, collector) | |||
| if cb := ctx.Value(SearchQueryStartCallbackKey); cb != nil { | |||
| if cbF, ok := cb.(SearchQueryStartCallbackFn); ok { | |||
| err = cbF(memNeeded) | |||
| } | |||
| } | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| if cb := ctx.Value(SearchQueryEndCallbackKey); cb != nil { | |||
| if cbF, ok := cb.(SearchQueryEndCallbackFn); ok { | |||
| defer func() { | |||
| _ = cbF(memNeeded) | |||
| }() | |||
| } | |||
| } | |||
| err = collector.Collect(ctx, searcher, indexReader) | |||
| if err != nil { | |||
| return nil, err | |||
| @@ -459,7 +546,8 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr | |||
| doc, err := indexReader.Document(hit.ID) | |||
| if err == nil && doc != nil { | |||
| if len(req.Fields) > 0 { | |||
| for _, f := range req.Fields { | |||
| fieldsToLoad := deDuplicate(req.Fields) | |||
| for _, f := range fieldsToLoad { | |||
| for _, docF := range doc.Fields { | |||
| if f == "*" || docF.Name() == f { | |||
| var value interface{} | |||
| @@ -533,9 +621,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr | |||
| return &SearchResult{ | |||
| Status: &SearchStatus{ | |||
| Total: 1, | |||
| Failed: 0, | |||
| Successful: 1, | |||
| Errors: make(map[string]error), | |||
| }, | |||
| Request: req, | |||
| Hits: hits, | |||
| @@ -755,3 +841,16 @@ func (f *indexImplFieldDict) Close() error { | |||
| } | |||
| return f.indexReader.Close() | |||
| } | |||
| // helper function to remove duplicate entries from a slice of strings | |||
| func deDuplicate(fields []string) []string { | |||
| entries := make(map[string]struct{}) | |||
| ret := []string{} | |||
| for _, entry := range fields { | |||
| if _, exists := entries[entry]; !exists { | |||
| entries[entry] = struct{}{} | |||
| ret = append(ret, entry) | |||
| } | |||
| } | |||
| return ret | |||
| } | |||
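So a request with Fields: []string{"title", "*", "title"} loads "title" and "*" once each, preserving first-seen order, rather than walking the stored fields of every hit twice for "title".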
| @@ -18,6 +18,7 @@ import ( | |||
| "encoding/json" | |||
| "io/ioutil" | |||
| "os" | |||
| "path/filepath" | |||
| "github.com/blevesearch/bleve/index/upsidedown" | |||
| ) | |||
| @@ -92,5 +93,5 @@ func (i *indexMeta) Save(path string) (err error) { | |||
| } | |||
| func indexMetaPath(path string) string { | |||
| return path + string(os.PathSeparator) + metaFilename | |||
| return filepath.Join(path, metaFilename) | |||
| } | |||
| @@ -42,7 +42,7 @@ type DocumentMapping struct { | |||
| Dynamic bool `json:"dynamic"` | |||
| Properties map[string]*DocumentMapping `json:"properties,omitempty"` | |||
| Fields []*FieldMapping `json:"fields,omitempty"` | |||
| DefaultAnalyzer string `json:"default_analyzer"` | |||
| DefaultAnalyzer string `json:"default_analyzer,omitempty"` | |||
| // StructTagKey overrides "json" when looking for field names in struct tags | |||
| StructTagKey string `json:"struct_tag_key,omitempty"` | |||
| @@ -324,13 +324,17 @@ func (dm *DocumentMapping) defaultAnalyzerName(path []string) string { | |||
| } | |||
| func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes []uint64, context *walkContext) { | |||
| // allow default "json" tag to be overriden | |||
| // allow default "json" tag to be overridden | |||
| structTagKey := dm.StructTagKey | |||
| if structTagKey == "" { | |||
| structTagKey = "json" | |||
| } | |||
| val := reflect.ValueOf(data) | |||
| if !val.IsValid() { | |||
| return | |||
| } | |||
| typ := val.Type() | |||
| switch typ.Kind() { | |||
| case reflect.Map: | |||
| @@ -420,7 +424,11 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string, | |||
| if subDocMapping != nil { | |||
| // index by explicit mapping | |||
| for _, fieldMapping := range subDocMapping.Fields { | |||
| fieldMapping.processString(propertyValueString, pathString, path, indexes, context) | |||
| if fieldMapping.Type == "geopoint" { | |||
| fieldMapping.processGeoPoint(property, pathString, path, indexes, context) | |||
| } else { | |||
| fieldMapping.processString(propertyValueString, pathString, path, indexes, context) | |||
| } | |||
| } | |||
| } else if closestDocMapping.Dynamic { | |||
| // automatic indexing behavior | |||
| @@ -320,8 +320,8 @@ func (im *IndexMappingImpl) determineType(data interface{}) string { | |||
| func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{}) error { | |||
| docType := im.determineType(data) | |||
| docMapping := im.mappingForType(docType) | |||
| walkContext := im.newWalkContext(doc, docMapping) | |||
| if docMapping.Enabled { | |||
| walkContext := im.newWalkContext(doc, docMapping) | |||
| docMapping.walkDocument(data, []string{}, []uint64{}, walkContext) | |||
| // see if the _all field was disabled | |||
| @@ -35,6 +35,9 @@ func lookupPropertyPath(data interface{}, path string) interface{} { | |||
| func lookupPropertyPathPart(data interface{}, part string) interface{} { | |||
| val := reflect.ValueOf(data) | |||
| if !val.IsValid() { | |||
| return nil | |||
| } | |||
| typ := val.Type() | |||
| switch typ.Kind() { | |||
| case reflect.Map: | |||
| @@ -14,7 +14,7 @@ var interleaveShift = []uint{1, 2, 4, 8, 16} | |||
| // Interleave the first 32 bits of each uint64 | |||
| // apdated from org.apache.lucene.util.BitUtil | |||
| // whcih was adapted from: | |||
| // which was adapted from: | |||
| // http://graphics.stanford.edu/~seander/bithacks.html#InterleaveBMN | |||
| func Interleave(v1, v2 uint64) uint64 { | |||
| v1 = (v1 | (v1 << interleaveShift[4])) & interleaveMagic[4] | |||
| @@ -77,6 +77,10 @@ func (p PrefixCoded) Int64() (int64, error) { | |||
| } | |||
| func ValidPrefixCodedTerm(p string) (bool, int) { | |||
| return ValidPrefixCodedTermBytes([]byte(p)) | |||
| } | |||
| func ValidPrefixCodedTermBytes(p []byte) (bool, int) { | |||
| if len(p) > 0 { | |||
| if p[0] < ShiftStartInt64 || p[0] > ShiftStartInt64+63 { | |||
| return false, 0 | |||
| @@ -17,15 +17,29 @@ package bleve | |||
| import ( | |||
| "encoding/json" | |||
| "fmt" | |||
| "reflect" | |||
| "time" | |||
| "github.com/blevesearch/bleve/analysis" | |||
| "github.com/blevesearch/bleve/analysis/datetime/optional" | |||
| "github.com/blevesearch/bleve/document" | |||
| "github.com/blevesearch/bleve/registry" | |||
| "github.com/blevesearch/bleve/search" | |||
| "github.com/blevesearch/bleve/search/collector" | |||
| "github.com/blevesearch/bleve/search/query" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeSearchResult int | |||
| var reflectStaticSizeSearchStatus int | |||
| func init() { | |||
| var sr SearchResult | |||
| reflectStaticSizeSearchResult = int(reflect.TypeOf(sr).Size()) | |||
| var ss SearchStatus | |||
| reflectStaticSizeSearchStatus = int(reflect.TypeOf(ss).Size()) | |||
| } | |||
| var cache = registry.NewCache() | |||
| const defaultDateTimeParser = optional.Name | |||
| @@ -247,6 +261,7 @@ func (h *HighlightRequest) AddField(field string) { | |||
| // Explain triggers inclusion of additional search | |||
| // result score explanations. | |||
| // Sort describes the desired order for the results to be returned. | |||
| // Score controls the kind of scoring performed | |||
| // | |||
| // A special field named "*" can be used to return all fields. | |||
| type SearchRequest struct { | |||
| @@ -259,6 +274,7 @@ type SearchRequest struct { | |||
| Explain bool `json:"explain"` | |||
| Sort search.SortOrder `json:"sort"` | |||
| IncludeLocations bool `json:"includeLocations"` | |||
| Score string `json:"score,omitempty"` | |||
| } | |||
| func (r *SearchRequest) Validate() error { | |||
| @@ -308,6 +324,7 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error { | |||
| Explain bool `json:"explain"` | |||
| Sort []json.RawMessage `json:"sort"` | |||
| IncludeLocations bool `json:"includeLocations"` | |||
| Score string `json:"score"` | |||
| } | |||
| err := json.Unmarshal(input, &temp) | |||
| @@ -334,6 +351,7 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error { | |||
| r.Fields = temp.Fields | |||
| r.Facets = temp.Facets | |||
| r.IncludeLocations = temp.IncludeLocations | |||
| r.Score = temp.Score | |||
| r.Query, err = query.ParseQuery(temp.Q) | |||
| if err != nil { | |||
| return err | |||
| @@ -432,6 +450,24 @@ type SearchResult struct { | |||
| Facets search.FacetResults `json:"facets"` | |||
| } | |||
| func (sr *SearchResult) Size() int { | |||
| sizeInBytes := reflectStaticSizeSearchResult + size.SizeOfPtr + | |||
| reflectStaticSizeSearchStatus | |||
| for _, entry := range sr.Hits { | |||
| if entry != nil { | |||
| sizeInBytes += entry.Size() | |||
| } | |||
| } | |||
| for k, v := range sr.Facets { | |||
| sizeInBytes += size.SizeOfString + len(k) + | |||
| v.Size() | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| func (sr *SearchResult) String() string { | |||
| rv := "" | |||
| if sr.Total > 0 { | |||
| @@ -488,3 +524,44 @@ func (sr *SearchResult) Merge(other *SearchResult) { | |||
| sr.Facets.Merge(other.Facets) | |||
| } | |||
| // MemoryNeededForSearchResult is an exported helper function to determine the RAM | |||
| // needed to accommodate the results for a given search request. | |||
| func MemoryNeededForSearchResult(req *SearchRequest) uint64 { | |||
| if req == nil { | |||
| return 0 | |||
| } | |||
| numDocMatches := req.Size + req.From | |||
| if req.Size+req.From > collector.PreAllocSizeSkipCap { | |||
| numDocMatches = collector.PreAllocSizeSkipCap | |||
| } | |||
| estimate := 0 | |||
| // overhead from the SearchResult structure | |||
| var sr SearchResult | |||
| estimate += sr.Size() | |||
| var dm search.DocumentMatch | |||
| sizeOfDocumentMatch := dm.Size() | |||
| // overhead from results | |||
| estimate += numDocMatches * sizeOfDocumentMatch | |||
| // overhead from facet results | |||
| if req.Facets != nil { | |||
| var fr search.FacetResult | |||
| estimate += len(req.Facets) * fr.Size() | |||
| } | |||
| // overhead from loading stored fields / highlighting, one document per hit | |||
| var d document.Document | |||
| if len(req.Fields) > 0 || req.Highlight != nil { | |||
| for i := 0; i < (req.Size + req.From); i++ { | |||
| estimate += d.Size() | |||
| } | |||
| } | |||
| return uint64(estimate) | |||
| } | |||
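| A hedged sketch of using the new helper as a pre-flight quota check; idx, quota, and the example query are assumptions, and the imports are fmt plus github.com/blevesearch/bleve: | |||
| func searchWithinQuota(idx bleve.Index, quota uint64) (*bleve.SearchResult, error) { | |||
|     req := bleve.NewSearchRequest(bleve.NewMatchQuery("example")) | |||
|     req.Size, req.From = 100, 0 | |||
|     if est := bleve.MemoryNeededForSearchResult(req); est > quota { | |||
|         return nil, fmt.Errorf("search needs ~%d bytes, over quota %d", est, quota) | |||
|     } | |||
|     return idx.Search(req) | |||
| } | |||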
| @@ -30,3 +30,23 @@ type Collector interface { | |||
| SetFacetsBuilder(facetsBuilder *FacetsBuilder) | |||
| FacetResults() FacetResults | |||
| } | |||
| // DocumentMatchHandler is the type of document match callback | |||
| // bleve will invoke during the search. | |||
| // At the end of an ongoing search, bleve indicates completion | |||
| // by passing a nil value to the document match callback. | |||
| // The application should take a copy of the hit/documentMatch | |||
| // if it wishes to own it or needs prolonged access to it. | |||
| type DocumentMatchHandler func(hit *DocumentMatch) error | |||
| type MakeDocumentMatchHandlerKeyType string | |||
| var MakeDocumentMatchHandlerKey = MakeDocumentMatchHandlerKeyType( | |||
| "MakeDocumentMatchHandlerKey") | |||
| // MakeDocumentMatchHandler is an optional DocumentMatchHandler | |||
| // builder function which applications can pass to bleve. | |||
| // The builder returns a DocumentMatchHandler function to | |||
| // bleve, which bleve will invoke on every document match. | |||
| type MakeDocumentMatchHandler func(ctx *SearchContext) ( | |||
| callback DocumentMatchHandler, loadID bool, err error) | |||
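| A minimal sketch of wiring a streaming handler through the context; it assumes an open bleve.Index named idx and imports of context, fmt, github.com/blevesearch/bleve, and github.com/blevesearch/bleve/search. Hits are recycled after each callback, so anything retained must be copied: | |||
| maker := search.MakeDocumentMatchHandler(func(_ *search.SearchContext) ( | |||
|     search.DocumentMatchHandler, bool, error) { | |||
|     handler := func(hit *search.DocumentMatch) error { | |||
|         if hit == nil { | |||
|             return nil // bleve signals completion with a nil hit | |||
|         } | |||
|         fmt.Println(hit.ID) // copy anything needed beyond this call | |||
|         return nil | |||
|     } | |||
|     return handler, true, nil // true asks the collector to load doc IDs | |||
| }) | |||
| ctx := context.WithValue(context.Background(), | |||
|     search.MakeDocumentMatchHandlerKey, maker) | |||
| _, err := idx.SearchInContext(ctx, bleve.NewSearchRequest(bleve.NewMatchAllQuery())) | |||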
| @@ -25,9 +25,9 @@ type collectStoreHeap struct { | |||
| compare collectorCompare | |||
| } | |||
| func newStoreHeap(cap int, compare collectorCompare) *collectStoreHeap { | |||
| func newStoreHeap(capacity int, compare collectorCompare) *collectStoreHeap { | |||
| rv := &collectStoreHeap{ | |||
| heap: make(search.DocumentMatchCollection, 0, cap), | |||
| heap: make(search.DocumentMatchCollection, 0, capacity), | |||
| compare: compare, | |||
| } | |||
| heap.Init(rv) | |||
| @@ -25,7 +25,7 @@ type collectStoreList struct { | |||
| compare collectorCompare | |||
| } | |||
| func newStoreList(cap int, compare collectorCompare) *collectStoreList { | |||
| func newStoreList(capacity int, compare collectorCompare) *collectStoreList { | |||
| rv := &collectStoreList{ | |||
| results: list.New(), | |||
| compare: compare, | |||
| @@ -34,8 +34,7 @@ func newStoreList(cap int, compare collectorCompare) *collectStoreList { | |||
| return rv | |||
| } | |||
| func (c *collectStoreList) AddNotExceedingSize(doc *search.DocumentMatch, | |||
| size int) *search.DocumentMatch { | |||
| func (c *collectStoreList) AddNotExceedingSize(doc *search.DocumentMatch, size int) *search.DocumentMatch { | |||
| c.add(doc) | |||
| if c.len() > size { | |||
| return c.removeLast() | |||
| @@ -21,9 +21,9 @@ type collectStoreSlice struct { | |||
| compare collectorCompare | |||
| } | |||
| func newStoreSlice(cap int, compare collectorCompare) *collectStoreSlice { | |||
| func newStoreSlice(capacity int, compare collectorCompare) *collectStoreSlice { | |||
| rv := &collectStoreSlice{ | |||
| slice: make(search.DocumentMatchCollection, 0, cap), | |||
| slice: make(search.DocumentMatchCollection, 0, capacity), | |||
| compare: compare, | |||
| } | |||
| return rv | |||
| @@ -16,12 +16,21 @@ package collector | |||
| import ( | |||
| "context" | |||
| "reflect" | |||
| "time" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/search" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeTopNCollector int | |||
| func init() { | |||
| var coll TopNCollector | |||
| reflectStaticSizeTopNCollector = int(reflect.TypeOf(coll).Size()) | |||
| } | |||
| type collectorStore interface { | |||
| // Add the document, and if the new store size exceeds the provided size | |||
| // the last element is removed and returned. If the size has not been | |||
| @@ -58,6 +67,8 @@ type TopNCollector struct { | |||
| cachedDesc []bool | |||
| lowestMatchOutsideResults *search.DocumentMatch | |||
| updateFieldVisitor index.DocumentFieldTermVisitor | |||
| dvReader index.DocValueReader | |||
| } | |||
| // CheckDoneEvery controls how frequently we check the context deadline | |||
| @@ -98,6 +109,22 @@ func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector | |||
| return hc | |||
| } | |||
| func (hc *TopNCollector) Size() int { | |||
| sizeInBytes := reflectStaticSizeTopNCollector + size.SizeOfPtr | |||
| if hc.facetsBuilder != nil { | |||
| sizeInBytes += hc.facetsBuilder.Size() | |||
| } | |||
| for _, entry := range hc.neededFields { | |||
| sizeInBytes += len(entry) + size.SizeOfString | |||
| } | |||
| sizeInBytes += len(hc.cachedScoring) + len(hc.cachedDesc) | |||
| return sizeInBytes | |||
| } | |||
| // Collect goes to the index to find the matching documents | |||
| func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error { | |||
| startTime := time.Now() | |||
| @@ -113,8 +140,34 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, | |||
| } | |||
| searchContext := &search.SearchContext{ | |||
| DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), len(hc.sort)), | |||
| Collector: hc, | |||
| } | |||
| hc.dvReader, err = reader.DocValueReader(hc.neededFields) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| hc.updateFieldVisitor = func(field string, term []byte) { | |||
| if hc.facetsBuilder != nil { | |||
| hc.facetsBuilder.UpdateVisitor(field, term) | |||
| } | |||
| hc.sort.UpdateVisitor(field, term) | |||
| } | |||
| dmHandlerMaker := MakeTopNDocumentMatchHandler | |||
| if cv := ctx.Value(search.MakeDocumentMatchHandlerKey); cv != nil { | |||
| dmHandlerMaker = cv.(search.MakeDocumentMatchHandler) | |||
| } | |||
| // use the application-given builder to make the custom document match | |||
| // handler, and perform callbacks/invocations on the newly made handler. | |||
| dmHandler, loadID, err := dmHandlerMaker(searchContext) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| hc.needDocIds = hc.needDocIds || loadID | |||
| select { | |||
| case <-ctx.Done(): | |||
| return ctx.Err() | |||
| @@ -130,13 +183,26 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, | |||
| } | |||
| } | |||
| err = hc.collectSingle(searchContext, reader, next) | |||
| err = hc.prepareDocumentMatch(searchContext, reader, next) | |||
| if err != nil { | |||
| break | |||
| } | |||
| err = dmHandler(next) | |||
| if err != nil { | |||
| break | |||
| } | |||
| next, err = searcher.Next(searchContext) | |||
| } | |||
| // help finalize/flush the results in case | |||
| // of custom document match handlers. | |||
| err = dmHandler(nil) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| // compute search duration | |||
| hc.took = time.Since(startTime) | |||
| if err != nil { | |||
| @@ -152,8 +218,8 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, | |||
| var sortByScoreOpt = []string{"_score"} | |||
| func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.IndexReader, d *search.DocumentMatch) error { | |||
| var err error | |||
| func (hc *TopNCollector) prepareDocumentMatch(ctx *search.SearchContext, | |||
| reader index.IndexReader, d *search.DocumentMatch) (err error) { | |||
| // visit field terms for features that require it (sort, facets) | |||
| if len(hc.neededFields) > 0 { | |||
| @@ -187,33 +253,49 @@ func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.I | |||
| hc.sort.Value(d) | |||
| } | |||
| // optimization, we track lowest sorting hit already removed from heap | |||
| // with this one comparison, we can avoid all heap operations if | |||
| // this hit would have been added and then immediately removed | |||
| if hc.lowestMatchOutsideResults != nil { | |||
| cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, hc.lowestMatchOutsideResults) | |||
| if cmp >= 0 { | |||
| // this hit can't possibly be in the result set, so avoid heap ops | |||
| ctx.DocumentMatchPool.Put(d) | |||
| return nil | |||
| } | |||
| } | |||
| return nil | |||
| } | |||
| removed := hc.store.AddNotExceedingSize(d, hc.size+hc.skip) | |||
| if removed != nil { | |||
| if hc.lowestMatchOutsideResults == nil { | |||
| hc.lowestMatchOutsideResults = removed | |||
| } else { | |||
| cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, removed, hc.lowestMatchOutsideResults) | |||
| if cmp < 0 { | |||
| tmp := hc.lowestMatchOutsideResults | |||
| hc.lowestMatchOutsideResults = removed | |||
| ctx.DocumentMatchPool.Put(tmp) | |||
| func MakeTopNDocumentMatchHandler( | |||
| ctx *search.SearchContext) (search.DocumentMatchHandler, bool, error) { | |||
| var hc *TopNCollector | |||
| var ok bool | |||
| if hc, ok = ctx.Collector.(*TopNCollector); ok { | |||
| return func(d *search.DocumentMatch) error { | |||
| if d == nil { | |||
| return nil | |||
| } | |||
| // optimization, we track lowest sorting hit already removed from heap | |||
| // with this one comparison, we can avoid all heap operations if | |||
| // this hit would have been added and then immediately removed | |||
| if hc.lowestMatchOutsideResults != nil { | |||
| cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, | |||
| hc.lowestMatchOutsideResults) | |||
| if cmp >= 0 { | |||
| // this hit can't possibly be in the result set, so avoid heap ops | |||
| ctx.DocumentMatchPool.Put(d) | |||
| return nil | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return nil | |||
| removed := hc.store.AddNotExceedingSize(d, hc.size+hc.skip) | |||
| if removed != nil { | |||
| if hc.lowestMatchOutsideResults == nil { | |||
| hc.lowestMatchOutsideResults = removed | |||
| } else { | |||
| cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, | |||
| removed, hc.lowestMatchOutsideResults) | |||
| if cmp < 0 { | |||
| tmp := hc.lowestMatchOutsideResults | |||
| hc.lowestMatchOutsideResults = removed | |||
| ctx.DocumentMatchPool.Put(tmp) | |||
| } | |||
| } | |||
| } | |||
| return nil | |||
| }, false, nil | |||
| } | |||
| return nil, false, nil | |||
| } | |||
| // visitFieldTerms is responsible for visiting the field terms of the | |||
| @@ -223,13 +305,7 @@ func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.Doc | |||
| hc.facetsBuilder.StartDoc() | |||
| } | |||
| err := reader.DocumentVisitFieldTerms(d.IndexInternalID, hc.neededFields, func(field string, term []byte) { | |||
| if hc.facetsBuilder != nil { | |||
| hc.facetsBuilder.UpdateVisitor(field, term) | |||
| } | |||
| hc.sort.UpdateVisitor(field, term) | |||
| }) | |||
| err := hc.dvReader.VisitDocValues(d.IndexInternalID, hc.updateFieldVisitor) | |||
| if hc.facetsBuilder != nil { | |||
| hc.facetsBuilder.EndDoc() | |||
| } | |||
| @@ -257,6 +333,7 @@ func (hc *TopNCollector) finalizeResults(r index.IndexReader) error { | |||
| return err | |||
| } | |||
| } | |||
| doc.Complete(nil) | |||
| return nil | |||
| }) | |||
| @@ -288,5 +365,5 @@ func (hc *TopNCollector) FacetResults() search.FacetResults { | |||
| if hc.facetsBuilder != nil { | |||
| return hc.facetsBuilder.Results() | |||
| } | |||
| return search.FacetResults{} | |||
| return nil | |||
| } | |||
| @@ -17,8 +17,18 @@ package search | |||
| import ( | |||
| "encoding/json" | |||
| "fmt" | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeExplanation int | |||
| func init() { | |||
| var e Explanation | |||
| reflectStaticSizeExplanation = int(reflect.TypeOf(e).Size()) | |||
| } | |||
| type Explanation struct { | |||
| Value float64 `json:"value"` | |||
| Message string `json:"message"` | |||
| @@ -32,3 +42,14 @@ func (expl *Explanation) String() string { | |||
| } | |||
| return string(js) | |||
| } | |||
| func (expl *Explanation) Size() int { | |||
| sizeInBytes := reflectStaticSizeExplanation + size.SizeOfPtr + | |||
| len(expl.Message) | |||
| for _, entry := range expl.Children { | |||
| sizeInBytes += entry.Size() | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| @@ -15,13 +15,25 @@ | |||
| package facet | |||
| import ( | |||
| "reflect" | |||
| "sort" | |||
| "time" | |||
| "github.com/blevesearch/bleve/numeric" | |||
| "github.com/blevesearch/bleve/search" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeDateTimeFacetBuilder int | |||
| var reflectStaticSizedateTimeRange int | |||
| func init() { | |||
| var dtfb DateTimeFacetBuilder | |||
| reflectStaticSizeDateTimeFacetBuilder = int(reflect.TypeOf(dtfb).Size()) | |||
| var dtr dateTimeRange | |||
| reflectStaticSizedateTimeRange = int(reflect.TypeOf(dtr).Size()) | |||
| } | |||
| type dateTimeRange struct { | |||
| start time.Time | |||
| end time.Time | |||
| @@ -46,6 +58,23 @@ func NewDateTimeFacetBuilder(field string, size int) *DateTimeFacetBuilder { | |||
| } | |||
| } | |||
| func (fb *DateTimeFacetBuilder) Size() int { | |||
| sizeInBytes := reflectStaticSizeDateTimeFacetBuilder + size.SizeOfPtr + | |||
| len(fb.field) | |||
| for k := range fb.termsCount { | |||
| sizeInBytes += size.SizeOfString + len(k) + | |||
| size.SizeOfInt | |||
| } | |||
| for k := range fb.ranges { | |||
| sizeInBytes += size.SizeOfString + len(k) + | |||
| size.SizeOfPtr + reflectStaticSizedateTimeRange | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| func (fb *DateTimeFacetBuilder) AddRange(name string, start, end time.Time) { | |||
| r := dateTimeRange{ | |||
| start: start, | |||
| @@ -15,12 +15,24 @@ | |||
| package facet | |||
| import ( | |||
| "reflect" | |||
| "sort" | |||
| "github.com/blevesearch/bleve/numeric" | |||
| "github.com/blevesearch/bleve/search" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeNumericFacetBuilder int | |||
| var reflectStaticSizenumericRange int | |||
| func init() { | |||
| var nfb NumericFacetBuilder | |||
| reflectStaticSizeNumericFacetBuilder = int(reflect.TypeOf(nfb).Size()) | |||
| var nr numericRange | |||
| reflectStaticSizenumericRange = int(reflect.TypeOf(nr).Size()) | |||
| } | |||
| type numericRange struct { | |||
| min *float64 | |||
| max *float64 | |||
| @@ -45,6 +57,23 @@ func NewNumericFacetBuilder(field string, size int) *NumericFacetBuilder { | |||
| } | |||
| } | |||
| func (fb *NumericFacetBuilder) Size() int { | |||
| sizeInBytes := reflectStaticSizeNumericFacetBuilder + size.SizeOfPtr + | |||
| len(fb.field) | |||
| for k := range fb.termsCount { | |||
| sizeInBytes += size.SizeOfString + len(k) + | |||
| size.SizeOfInt | |||
| } | |||
| for k := range fb.ranges { | |||
| sizeInBytes += size.SizeOfString + len(k) + | |||
| size.SizeOfPtr + reflectStaticSizenumericRange | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| func (fb *NumericFacetBuilder) AddRange(name string, min, max *float64) { | |||
| r := numericRange{ | |||
| min: min, | |||
| @@ -15,11 +15,20 @@ | |||
| package facet | |||
| import ( | |||
| "reflect" | |||
| "sort" | |||
| "github.com/blevesearch/bleve/search" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeTermsFacetBuilder int | |||
| func init() { | |||
| var tfb TermsFacetBuilder | |||
| reflectStaticSizeTermsFacetBuilder = int(reflect.TypeOf(tfb).Size()) | |||
| } | |||
| type TermsFacetBuilder struct { | |||
| size int | |||
| field string | |||
| @@ -37,6 +46,18 @@ func NewTermsFacetBuilder(field string, size int) *TermsFacetBuilder { | |||
| } | |||
| } | |||
| func (fb *TermsFacetBuilder) Size() int { | |||
| sizeInBytes := reflectStaticSizeTermsFacetBuilder + size.SizeOfPtr + | |||
| len(fb.field) | |||
| for k := range fb.termsCount { | |||
| sizeInBytes += size.SizeOfString + len(k) + | |||
| size.SizeOfInt | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| func (fb *TermsFacetBuilder) Field() string { | |||
| return fb.field | |||
| } | |||
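| These builders are driven through the public facet request API; a hedged end-to-end sketch, assuming an open index idx whose documents carry a "tags" field: | |||
| req := bleve.NewSearchRequest(bleve.NewMatchAllQuery()) | |||
| req.AddFacet("tags", bleve.NewFacetRequest("tags", 10)) // top 10 terms | |||
| res, err := idx.Search(req) | |||
| if err == nil { | |||
|     for _, tf := range res.Facets["tags"].Terms { | |||
|         fmt.Println(tf.Term, tf.Count) | |||
|     } | |||
| } | |||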
| @@ -15,11 +15,32 @@ | |||
| package search | |||
| import ( | |||
| "reflect" | |||
| "sort" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeFacetsBuilder int | |||
| var reflectStaticSizeFacetResult int | |||
| var reflectStaticSizeTermFacet int | |||
| var reflectStaticSizeNumericRangeFacet int | |||
| var reflectStaticSizeDateRangeFacet int | |||
| func init() { | |||
| var fb FacetsBuilder | |||
| reflectStaticSizeFacetsBuilder = int(reflect.TypeOf(fb).Size()) | |||
| var fr FacetResult | |||
| reflectStaticSizeFacetResult = int(reflect.TypeOf(fr).Size()) | |||
| var tf TermFacet | |||
| reflectStaticSizeTermFacet = int(reflect.TypeOf(tf).Size()) | |||
| var nrf NumericRangeFacet | |||
| reflectStaticSizeNumericRangeFacet = int(reflect.TypeOf(nrf).Size()) | |||
| var drf DateRangeFacet | |||
| reflectStaticSizeDateRangeFacet = int(reflect.TypeOf(drf).Size()) | |||
| } | |||
| type FacetBuilder interface { | |||
| StartDoc() | |||
| UpdateVisitor(field string, term []byte) | |||
| @@ -27,23 +48,40 @@ type FacetBuilder interface { | |||
| Result() *FacetResult | |||
| Field() string | |||
| Size() int | |||
| } | |||
| type FacetsBuilder struct { | |||
| indexReader index.IndexReader | |||
| facets map[string]FacetBuilder | |||
| facetNames []string | |||
| facets []FacetBuilder | |||
| fields []string | |||
| } | |||
| func NewFacetsBuilder(indexReader index.IndexReader) *FacetsBuilder { | |||
| return &FacetsBuilder{ | |||
| indexReader: indexReader, | |||
| facets: make(map[string]FacetBuilder, 0), | |||
| } | |||
| } | |||
| func (fb *FacetsBuilder) Size() int { | |||
| sizeInBytes := reflectStaticSizeFacetsBuilder + size.SizeOfPtr | |||
| for k, v := range fb.facets { | |||
| sizeInBytes += size.SizeOfString + v.Size() + len(fb.facetNames[k]) | |||
| } | |||
| for _, entry := range fb.fields { | |||
| sizeInBytes += size.SizeOfString + len(entry) | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| func (fb *FacetsBuilder) Add(name string, facetBuilder FacetBuilder) { | |||
| fb.facets[name] = facetBuilder | |||
| fb.facetNames = append(fb.facetNames, name) | |||
| fb.facets = append(fb.facets, facetBuilder) | |||
| fb.fields = append(fb.fields, facetBuilder.Field()) | |||
| } | |||
| @@ -213,6 +251,14 @@ type FacetResult struct { | |||
| DateRanges DateRangeFacets `json:"date_ranges,omitempty"` | |||
| } | |||
| func (fr *FacetResult) Size() int { | |||
| return reflectStaticSizeFacetResult + size.SizeOfPtr + | |||
| len(fr.Field) + | |||
| len(fr.Terms)*(reflectStaticSizeTermFacet+size.SizeOfPtr) + | |||
| len(fr.NumericRanges)*(reflectStaticSizeNumericRangeFacet+size.SizeOfPtr) + | |||
| len(fr.DateRanges)*(reflectStaticSizeDateRangeFacet+size.SizeOfPtr) | |||
| } | |||
| func (fr *FacetResult) Merge(other *FacetResult) { | |||
| fr.Total += other.Total | |||
| fr.Missing += other.Missing | |||
| @@ -287,9 +333,9 @@ func (fr FacetResults) Fixup(name string, size int) { | |||
| func (fb *FacetsBuilder) Results() FacetResults { | |||
| fr := make(FacetResults) | |||
| for facetName, facetBuilder := range fb.facets { | |||
| for i, facetBuilder := range fb.facets { | |||
| facetResult := facetBuilder.Result() | |||
| fr[facetName] = facetResult | |||
| fr[fb.facetNames[i]] = facetResult | |||
| } | |||
| return fr | |||
| } | |||
| @@ -57,15 +57,24 @@ func LevenshteinDistance(a, b string) int { | |||
| // in which case the first return val will be the max | |||
| // and the second will be true, indicating max was exceeded | |||
| func LevenshteinDistanceMax(a, b string, max int) (int, bool) { | |||
| v, wasMax, _ := LevenshteinDistanceMaxReuseSlice(a, b, max, nil) | |||
| return v, wasMax | |||
| } | |||
| func LevenshteinDistanceMaxReuseSlice(a, b string, max int, d []int) (int, bool, []int) { | |||
| la := len(a) | |||
| lb := len(b) | |||
| ld := int(math.Abs(float64(la - lb))) | |||
| if ld > max { | |||
| return max, true | |||
| return max, true, d | |||
| } | |||
| d := make([]int, la+1) | |||
| if cap(d) < la+1 { | |||
| d = make([]int, la+1) | |||
| } | |||
| d = d[:la+1] | |||
| var lastdiag, olddiag, temp int | |||
| for i := 1; i <= la; i++ { | |||
| @@ -98,8 +107,8 @@ func LevenshteinDistanceMax(a, b string, max int) (int, bool) { | |||
| } | |||
| // after each row if rowmin isn't less than max stop | |||
| if rowmin > max { | |||
| return max, true | |||
| return max, true, d | |||
| } | |||
| } | |||
| return d[la], false | |||
| return d[la], false, d | |||
| } | |||
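| The new ReuseSlice variant lets a caller amortize the scratch row across many comparisons; a hedged sketch, with candidates an assumed []string: | |||
| var scratch []int | |||
| for _, cand := range candidates { | |||
|     var dist int | |||
|     var exceeded bool | |||
|     dist, exceeded, scratch = search.LevenshteinDistanceMaxReuseSlice("water", cand, 2, scratch) | |||
|     if !exceeded { | |||
|         fmt.Println(cand, dist) // within edit distance 2 | |||
|     } | |||
| } | |||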
| @@ -14,6 +14,17 @@ | |||
| package search | |||
| import ( | |||
| "reflect" | |||
| ) | |||
| var reflectStaticSizeDocumentMatchPool int | |||
| func init() { | |||
| var dmp DocumentMatchPool | |||
| reflectStaticSizeDocumentMatchPool = int(reflect.TypeOf(dmp).Size()) | |||
| } | |||
| // DocumentMatchPoolTooSmall is a callback function that can be executed | |||
| // when the DocumentMatchPool does not have sufficient capacity | |||
| // By default we simply perform just-in-time allocation, but you could log | |||
| @@ -70,9 +70,11 @@ func (q *ConjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, | |||
| } | |||
| ss = append(ss, sr) | |||
| } | |||
| if len(ss) < 1 { | |||
| return searcher.NewMatchNoneSearcher(i) | |||
| } | |||
| return searcher.NewConjunctionSearcher(i, ss, options) | |||
| } | |||
| @@ -58,7 +58,8 @@ func (q *DisjunctionQuery) SetMin(m float64) { | |||
| q.Min = m | |||
| } | |||
| func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) { | |||
| func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, | |||
| options search.SearcherOptions) (search.Searcher, error) { | |||
| ss := make([]search.Searcher, 0, len(q.Disjuncts)) | |||
| for _, disjunct := range q.Disjuncts { | |||
| sr, err := disjunct.Searcher(i, m, options) | |||
| @@ -76,9 +77,17 @@ func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, | |||
| } | |||
| ss = append(ss, sr) | |||
| } | |||
| if len(ss) < 1 { | |||
| return searcher.NewMatchNoneSearcher(i) | |||
| } else if len(ss) == 1 && int(q.Min) == ss[0].Min() { | |||
| // apply optimization only if both conditions below are satisfied: | |||
| // - disjunction searcher has only 1 child searcher | |||
| // - parent searcher's min setting is equal to child searcher's min | |||
| return ss[0], nil | |||
| } | |||
| return searcher.NewDisjunctionSearcher(i, ss, q.Min, options) | |||
| } | |||
| @@ -296,32 +296,28 @@ func expandQuery(m mapping.IndexMapping, query Query) (Query, error) { | |||
| } | |||
| expand = func(query Query) (Query, error) { | |||
| switch query.(type) { | |||
| switch q := query.(type) { | |||
| case *QueryStringQuery: | |||
| q := query.(*QueryStringQuery) | |||
| parsed, err := parseQuerySyntax(q.Query) | |||
| if err != nil { | |||
| return nil, fmt.Errorf("could not parse '%s': %s", q.Query, err) | |||
| } | |||
| return expand(parsed) | |||
| case *ConjunctionQuery: | |||
| q := *query.(*ConjunctionQuery) | |||
| children, err := expandSlice(q.Conjuncts) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| q.Conjuncts = children | |||
| return &q, nil | |||
| return q, nil | |||
| case *DisjunctionQuery: | |||
| q := *query.(*DisjunctionQuery) | |||
| children, err := expandSlice(q.Disjuncts) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| q.Disjuncts = children | |||
| return &q, nil | |||
| return q, nil | |||
| case *BooleanQuery: | |||
| q := *query.(*BooleanQuery) | |||
| var err error | |||
| q.Must, err = expand(q.Must) | |||
| if err != nil { | |||
| @@ -335,7 +331,7 @@ func expandQuery(m mapping.IndexMapping, query Query) (Query, error) { | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| return &q, nil | |||
| return q, nil | |||
| default: | |||
| return query, nil | |||
| } | |||
| @@ -273,6 +273,7 @@ func inNumOrStrState(l *queryStringLex, next rune, eof bool) (lexState, bool) { | |||
| // see where to go | |||
| if !l.seenDot && next == '.' { | |||
| // stay in this state | |||
| l.seenDot = true | |||
| l.buf += string(next) | |||
| return inNumOrStrState, true | |||
| } else if unicode.IsDigit(next) { | |||
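| The one-line fix sets seenDot, which was previously checked but never set, so every later '.' was still treated as the first; with it, only one dot can be consumed as numeric punctuation. A hedged illustration via the exported query-string parser: | |||
| q := query.NewQueryStringQuery(`version:1.2.3`) | |||
| parsed, err := q.Parse() // the second '.' no longer extends a number token | |||
| fmt.Println(parsed, err) | |||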
| @@ -15,7 +15,6 @@ | |||
| package query | |||
| import ( | |||
| "regexp" | |||
| "strings" | |||
| "github.com/blevesearch/bleve/index" | |||
| @@ -28,7 +27,6 @@ type RegexpQuery struct { | |||
| Regexp string `json:"regexp"` | |||
| FieldVal string `json:"field,omitempty"` | |||
| BoostVal *Boost `json:"boost,omitempty"` | |||
| compiled *regexp.Regexp | |||
| } | |||
| // NewRegexpQuery creates a new Query which finds | |||
| @@ -64,33 +62,20 @@ func (q *RegexpQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, opti | |||
| if q.FieldVal == "" { | |||
| field = m.DefaultSearchField() | |||
| } | |||
| err := q.compile() | |||
| if err != nil { | |||
| return nil, err | |||
| // require that pattern NOT be anchored to start and end of term. | |||
| // do not attempt to remove trailing $, its presence is not | |||
| // known to interfere with LiteralPrefix() the way ^ does | |||
| // and removing $ introduces possible ambiguities with escaped \$, \\$, etc | |||
| actualRegexp := q.Regexp | |||
| if strings.HasPrefix(actualRegexp, "^") { | |||
| actualRegexp = actualRegexp[1:] // remove leading ^ | |||
| } | |||
| return searcher.NewRegexpSearcher(i, q.compiled, field, q.BoostVal.Value(), options) | |||
| return searcher.NewRegexpStringSearcher(i, actualRegexp, field, | |||
| q.BoostVal.Value(), options) | |||
| } | |||
| func (q *RegexpQuery) Validate() error { | |||
| return q.compile() | |||
| } | |||
| func (q *RegexpQuery) compile() error { | |||
| if q.compiled == nil { | |||
| // require that pattern NOT be anchored to start and end of term | |||
| actualRegexp := q.Regexp | |||
| if strings.HasPrefix(actualRegexp, "^") { | |||
| actualRegexp = actualRegexp[1:] // remove leading ^ | |||
| } | |||
| // do not attempt to remove trailing $, it's presence is not | |||
| // known to interfere with LiteralPrefix() the way ^ does | |||
| // and removing $ introduces possible ambiguities with escaped \$, \\$, etc | |||
| var err error | |||
| q.compiled, err = regexp.Compile(actualRegexp) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| } | |||
| return nil | |||
| return nil // real validation delayed until searcher constructor | |||
| } | |||
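| With compilation moved into the searcher, Validate is now a no-op and only a leading '^' is stripped up front (terms are matched whole); a hedged sketch, assuming an open index idx: | |||
| q := bleve.NewRegexpQuery("^wat.r") // leading ^ removed; matches terms like "water" | |||
| q.SetField("body") | |||
| res, err := idx.Search(bleve.NewSearchRequest(q)) | |||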
| @@ -15,7 +15,6 @@ | |||
| package query | |||
| import ( | |||
| "regexp" | |||
| "strings" | |||
| "github.com/blevesearch/bleve/index" | |||
| @@ -47,7 +46,6 @@ type WildcardQuery struct { | |||
| Wildcard string `json:"wildcard"` | |||
| FieldVal string `json:"field,omitempty"` | |||
| BoostVal *Boost `json:"boost,omitempty"` | |||
| compiled *regexp.Regexp | |||
| } | |||
| // NewWildcardQuery creates a new Query which finds | |||
| @@ -83,24 +81,13 @@ func (q *WildcardQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, op | |||
| if q.FieldVal == "" { | |||
| field = m.DefaultSearchField() | |||
| } | |||
| if q.compiled == nil { | |||
| var err error | |||
| q.compiled, err = q.convertToRegexp() | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| return searcher.NewRegexpSearcher(i, q.compiled, field, q.BoostVal.Value(), options) | |||
| } | |||
| regexpString := wildcardRegexpReplacer.Replace(q.Wildcard) | |||
| func (q *WildcardQuery) Validate() error { | |||
| var err error | |||
| q.compiled, err = q.convertToRegexp() | |||
| return err | |||
| return searcher.NewRegexpStringSearcher(i, regexpString, field, | |||
| q.BoostVal.Value(), options) | |||
| } | |||
| func (q *WildcardQuery) convertToRegexp() (*regexp.Regexp, error) { | |||
| regexpString := wildcardRegexpReplacer.Replace(q.Wildcard) | |||
| return regexp.Compile(regexpString) | |||
| func (q *WildcardQuery) Validate() error { | |||
| return nil // real validation delayed until searcher constructor | |||
| } | |||
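| WildcardQuery now follows the same pattern: the wildcard is rewritten into a regexp string (with '?' becoming '.' and '*' becoming '.*') and handed to the string-based searcher; a hedged sketch: | |||
| q := bleve.NewWildcardQuery("wat?r*") // internally becomes the regexp wat.r.* | |||
| q.SetField("body") | |||
| res, err := idx.Search(bleve.NewSearchRequest(q)) | |||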
| @@ -15,13 +15,27 @@ | |||
| package scorer | |||
| import ( | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/search" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeConjunctionQueryScorer int | |||
| func init() { | |||
| var cqs ConjunctionQueryScorer | |||
| reflectStaticSizeConjunctionQueryScorer = int(reflect.TypeOf(cqs).Size()) | |||
| } | |||
| type ConjunctionQueryScorer struct { | |||
| options search.SearcherOptions | |||
| } | |||
| func (s *ConjunctionQueryScorer) Size() int { | |||
| return reflectStaticSizeConjunctionQueryScorer + size.SizeOfPtr | |||
| } | |||
| func NewConjunctionQueryScorer(options search.SearcherOptions) *ConjunctionQueryScorer { | |||
| return &ConjunctionQueryScorer{ | |||
| options: options, | |||
| @@ -35,15 +49,11 @@ func (s *ConjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [ | |||
| childrenExplanations = make([]*search.Explanation, len(constituents)) | |||
| } | |||
| locations := []search.FieldTermLocationMap{} | |||
| for i, docMatch := range constituents { | |||
| sum += docMatch.Score | |||
| if s.options.Explain { | |||
| childrenExplanations[i] = docMatch.Expl | |||
| } | |||
| if docMatch.Locations != nil { | |||
| locations = append(locations, docMatch.Locations) | |||
| } | |||
| } | |||
| newScore := sum | |||
| var newExpl *search.Explanation | |||
| @@ -55,11 +65,8 @@ func (s *ConjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [ | |||
| rv := constituents[0] | |||
| rv.Score = newScore | |||
| rv.Expl = newExpl | |||
| if len(locations) == 1 { | |||
| rv.Locations = locations[0] | |||
| } else if len(locations) > 1 { | |||
| rv.Locations = search.MergeLocations(locations) | |||
| } | |||
| rv.FieldTermLocations = search.MergeFieldTermLocations( | |||
| rv.FieldTermLocations, constituents[1:]) | |||
| return rv | |||
| } | |||
| @@ -16,11 +16,20 @@ package scorer | |||
| import ( | |||
| "fmt" | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/search" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeConstantScorer int | |||
| func init() { | |||
| var cs ConstantScorer | |||
| reflectStaticSizeConstantScorer = int(reflect.TypeOf(cs).Size()) | |||
| } | |||
| type ConstantScorer struct { | |||
| constant float64 | |||
| boost float64 | |||
| @@ -30,6 +39,16 @@ type ConstantScorer struct { | |||
| queryWeightExplanation *search.Explanation | |||
| } | |||
| func (s *ConstantScorer) Size() int { | |||
| sizeInBytes := reflectStaticSizeConstantScorer + size.SizeOfPtr | |||
| if s.queryWeightExplanation != nil { | |||
| sizeInBytes += s.queryWeightExplanation.Size() | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| func NewConstantScorer(constant float64, boost float64, options search.SearcherOptions) *ConstantScorer { | |||
| rv := ConstantScorer{ | |||
| options: options, | |||
| @@ -16,14 +16,27 @@ package scorer | |||
| import ( | |||
| "fmt" | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/search" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeDisjunctionQueryScorer int | |||
| func init() { | |||
| var dqs DisjunctionQueryScorer | |||
| reflectStaticSizeDisjunctionQueryScorer = int(reflect.TypeOf(dqs).Size()) | |||
| } | |||
| type DisjunctionQueryScorer struct { | |||
| options search.SearcherOptions | |||
| } | |||
| func (s *DisjunctionQueryScorer) Size() int { | |||
| return reflectStaticSizeDisjunctionQueryScorer + size.SizeOfPtr | |||
| } | |||
| func NewDisjunctionQueryScorer(options search.SearcherOptions) *DisjunctionQueryScorer { | |||
| return &DisjunctionQueryScorer{ | |||
| options: options, | |||
| @@ -37,15 +50,11 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [ | |||
| childrenExplanations = make([]*search.Explanation, len(constituents)) | |||
| } | |||
| var locations []search.FieldTermLocationMap | |||
| for i, docMatch := range constituents { | |||
| sum += docMatch.Score | |||
| if s.options.Explain { | |||
| childrenExplanations[i] = docMatch.Expl | |||
| } | |||
| if docMatch.Locations != nil { | |||
| locations = append(locations, docMatch.Locations) | |||
| } | |||
| } | |||
| var rawExpl *search.Explanation | |||
| @@ -67,11 +76,8 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [ | |||
| rv := constituents[0] | |||
| rv.Score = newScore | |||
| rv.Expl = newExpl | |||
| if len(locations) == 1 { | |||
| rv.Locations = locations[0] | |||
| } else if len(locations) > 1 { | |||
| rv.Locations = search.MergeLocations(locations) | |||
| } | |||
| rv.FieldTermLocations = search.MergeFieldTermLocations( | |||
| rv.FieldTermLocations, constituents[1:]) | |||
| return rv | |||
| } | |||
| @@ -17,13 +17,22 @@ package scorer | |||
| import ( | |||
| "fmt" | |||
| "math" | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/search" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeTermQueryScorer int | |||
| func init() { | |||
| var tqs TermQueryScorer | |||
| reflectStaticSizeTermQueryScorer = int(reflect.TypeOf(tqs).Size()) | |||
| } | |||
| type TermQueryScorer struct { | |||
| queryTerm []byte | |||
| queryTerm string | |||
| queryField string | |||
| queryBoost float64 | |||
| docTerm uint64 | |||
| @@ -36,9 +45,24 @@ type TermQueryScorer struct { | |||
| queryWeightExplanation *search.Explanation | |||
| } | |||
| func (s *TermQueryScorer) Size() int { | |||
| sizeInBytes := reflectStaticSizeTermQueryScorer + size.SizeOfPtr + | |||
| len(s.queryTerm) + len(s.queryField) | |||
| if s.idfExplanation != nil { | |||
| sizeInBytes += s.idfExplanation.Size() | |||
| } | |||
| if s.queryWeightExplanation != nil { | |||
| sizeInBytes += s.queryWeightExplanation.Size() | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| func NewTermQueryScorer(queryTerm []byte, queryField string, queryBoost float64, docTotal, docTerm uint64, options search.SearcherOptions) *TermQueryScorer { | |||
| rv := TermQueryScorer{ | |||
| queryTerm: queryTerm, | |||
| queryTerm: string(queryTerm), | |||
| queryField: queryField, | |||
| queryBoost: queryBoost, | |||
| docTerm: docTerm, | |||
| @@ -82,7 +106,7 @@ func (s *TermQueryScorer) SetQueryNorm(qnorm float64) { | |||
| } | |||
| s.queryWeightExplanation = &search.Explanation{ | |||
| Value: s.queryWeight, | |||
| Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.queryField, string(s.queryTerm), s.queryBoost), | |||
| Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.queryField, s.queryTerm, s.queryBoost), | |||
| Children: childrenExplanations, | |||
| } | |||
| } | |||
| @@ -104,7 +128,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term | |||
| childrenExplanations := make([]*search.Explanation, 3) | |||
| childrenExplanations[0] = &search.Explanation{ | |||
| Value: tf, | |||
| Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, string(s.queryTerm), termMatch.Freq), | |||
| Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq), | |||
| } | |||
| childrenExplanations[1] = &search.Explanation{ | |||
| Value: termMatch.Norm, | |||
| @@ -113,7 +137,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term | |||
| childrenExplanations[2] = s.idfExplanation | |||
| scoreExplanation = &search.Explanation{ | |||
| Value: score, | |||
| Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, string(s.queryTerm), termMatch.ID), | |||
| Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, s.queryTerm, termMatch.ID), | |||
| Children: childrenExplanations, | |||
| } | |||
| } | |||
| @@ -127,7 +151,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term | |||
| childExplanations[1] = scoreExplanation | |||
| scoreExplanation = &search.Explanation{ | |||
| Value: score, | |||
| Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, string(s.queryTerm), s.queryBoost, termMatch.ID), | |||
| Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, s.queryTerm, s.queryBoost, termMatch.ID), | |||
| Children: childExplanations, | |||
| } | |||
| } | |||
| @@ -140,41 +164,31 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term | |||
| rv.Expl = scoreExplanation | |||
| } | |||
| if termMatch.Vectors != nil && len(termMatch.Vectors) > 0 { | |||
| locs := make([]search.Location, len(termMatch.Vectors)) | |||
| locsUsed := 0 | |||
| totalPositions := 0 | |||
| for _, v := range termMatch.Vectors { | |||
| totalPositions += len(v.ArrayPositions) | |||
| if len(termMatch.Vectors) > 0 { | |||
| if cap(rv.FieldTermLocations) < len(termMatch.Vectors) { | |||
| rv.FieldTermLocations = make([]search.FieldTermLocation, 0, len(termMatch.Vectors)) | |||
| } | |||
| positions := make(search.ArrayPositions, totalPositions) | |||
| positionsUsed := 0 | |||
| rv.Locations = make(search.FieldTermLocationMap) | |||
| for _, v := range termMatch.Vectors { | |||
| tlm := rv.Locations[v.Field] | |||
| if tlm == nil { | |||
| tlm = make(search.TermLocationMap) | |||
| rv.Locations[v.Field] = tlm | |||
| } | |||
| loc := &locs[locsUsed] | |||
| locsUsed++ | |||
| loc.Pos = v.Pos | |||
| loc.Start = v.Start | |||
| loc.End = v.End | |||
| var ap search.ArrayPositions | |||
| if len(v.ArrayPositions) > 0 { | |||
| loc.ArrayPositions = positions[positionsUsed : positionsUsed+len(v.ArrayPositions)] | |||
| for i, ap := range v.ArrayPositions { | |||
| loc.ArrayPositions[i] = ap | |||
| n := len(rv.FieldTermLocations) | |||
| if n < cap(rv.FieldTermLocations) { // reuse ap slice if available | |||
| ap = rv.FieldTermLocations[:n+1][n].Location.ArrayPositions[:0] | |||
| } | |||
| positionsUsed += len(v.ArrayPositions) | |||
| ap = append(ap, v.ArrayPositions...) | |||
| } | |||
| tlm[string(s.queryTerm)] = append(tlm[string(s.queryTerm)], loc) | |||
| rv.FieldTermLocations = | |||
| append(rv.FieldTermLocations, search.FieldTermLocation{ | |||
| Field: v.Field, | |||
| Term: s.queryTerm, | |||
| Location: search.Location{ | |||
| Pos: v.Pos, | |||
| Start: v.Start, | |||
| End: v.End, | |||
| ArrayPositions: ap, | |||
| }, | |||
| }) | |||
| } | |||
| } | |||
| @@ -16,11 +16,25 @@ package search | |||
| import ( | |||
| "fmt" | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/document" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeDocumentMatch int | |||
| var reflectStaticSizeSearchContext int | |||
| var reflectStaticSizeLocation int | |||
| func init() { | |||
| var dm DocumentMatch | |||
| reflectStaticSizeDocumentMatch = int(reflect.TypeOf(dm).Size()) | |||
| var sc SearchContext | |||
| reflectStaticSizeSearchContext = int(reflect.TypeOf(sc).Size()) | |||
| var l Location | |||
| reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) | |||
| } | |||
| type ArrayPositions []uint64 | |||
| func (ap ArrayPositions) Equals(other ArrayPositions) bool { | |||
| @@ -47,6 +61,11 @@ type Location struct { | |||
| ArrayPositions ArrayPositions `json:"array_positions"` | |||
| } | |||
| func (l *Location) Size() int { | |||
| return reflectStaticSizeLocation + size.SizeOfPtr + | |||
| len(l.ArrayPositions)*size.SizeOfUint64 | |||
| } | |||
| type Locations []*Location | |||
| type TermLocationMap map[string]Locations | |||
| @@ -57,6 +76,12 @@ func (t TermLocationMap) AddLocation(term string, location *Location) { | |||
| type FieldTermLocationMap map[string]TermLocationMap | |||
| type FieldTermLocation struct { | |||
| Field string | |||
| Term string | |||
| Location Location | |||
| } | |||
| type FieldFragmentMap map[string][]string | |||
| type DocumentMatch struct { | |||
| @@ -74,11 +99,14 @@ type DocumentMatch struct { | |||
| // fields as float64s and date fields as time.RFC3339 formatted strings. | |||
| Fields map[string]interface{} `json:"fields,omitempty"` | |||
| // if we load the document for this hit, remember it so we don't load it again | |||
| Document *document.Document `json:"-"` | |||
| // used to maintain natural index order | |||
| HitNumber uint64 `json:"-"` | |||
| // used to temporarily hold field term location information during | |||
| // search processing in an efficient, recycle-friendly manner, to | |||
| // be later incorporated into the Locations map when search | |||
| // results are completed | |||
| FieldTermLocations []FieldTermLocation `json:"-"` | |||
| } | |||
| func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) { | |||
| @@ -108,15 +136,116 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { | |||
| indexInternalID := dm.IndexInternalID | |||
| // remember the []interface{} used for sort | |||
| sort := dm.Sort | |||
| // remember the FieldTermLocations backing array | |||
| ftls := dm.FieldTermLocations | |||
| for i := range ftls { // recycle the ArrayPositions of each location | |||
| ftls[i].Location.ArrayPositions = ftls[i].Location.ArrayPositions[:0] | |||
| } | |||
| // idiom to copy over from empty DocumentMatch (0 allocations) | |||
| *dm = DocumentMatch{} | |||
| // reuse the []byte already allocated (and reset len to 0) | |||
| dm.IndexInternalID = indexInternalID[:0] | |||
| // reuse the []interface{} already allocated (and reset len to 0) | |||
| dm.Sort = sort[:0] | |||
| // reuse the FieldTermLocations already allocated (and reset len to 0) | |||
| dm.FieldTermLocations = ftls[:0] | |||
| return dm | |||
| } | |||
| func (dm *DocumentMatch) Size() int { | |||
| sizeInBytes := reflectStaticSizeDocumentMatch + size.SizeOfPtr + | |||
| len(dm.Index) + | |||
| len(dm.ID) + | |||
| len(dm.IndexInternalID) | |||
| if dm.Expl != nil { | |||
| sizeInBytes += dm.Expl.Size() | |||
| } | |||
| for k, v := range dm.Locations { | |||
| sizeInBytes += size.SizeOfString + len(k) | |||
| for k1, v1 := range v { | |||
| sizeInBytes += size.SizeOfString + len(k1) + | |||
| size.SizeOfSlice | |||
| for _, entry := range v1 { | |||
| sizeInBytes += entry.Size() | |||
| } | |||
| } | |||
| } | |||
| for k, v := range dm.Fragments { | |||
| sizeInBytes += size.SizeOfString + len(k) + | |||
| size.SizeOfSlice | |||
| for _, entry := range v { | |||
| sizeInBytes += size.SizeOfString + len(entry) | |||
| } | |||
| } | |||
| for _, entry := range dm.Sort { | |||
| sizeInBytes += size.SizeOfString + len(entry) | |||
| } | |||
| for k := range dm.Fields { | |||
| sizeInBytes += size.SizeOfString + len(k) + | |||
| size.SizeOfPtr | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| // Complete performs final preparation & transformation of the | |||
| // DocumentMatch at the end of search processing, also allowing the | |||
| // caller to provide an optional preallocated locations slice | |||
| func (dm *DocumentMatch) Complete(prealloc []Location) []Location { | |||
| // transform the FieldTermLocations slice into the Locations map | |||
| nlocs := len(dm.FieldTermLocations) | |||
| if nlocs > 0 { | |||
| if cap(prealloc) < nlocs { | |||
| prealloc = make([]Location, nlocs) | |||
| } | |||
| prealloc = prealloc[:nlocs] | |||
| var lastField string | |||
| var tlm TermLocationMap | |||
| for i, ftl := range dm.FieldTermLocations { | |||
| if lastField != ftl.Field { | |||
| lastField = ftl.Field | |||
| if dm.Locations == nil { | |||
| dm.Locations = make(FieldTermLocationMap) | |||
| } | |||
| tlm = dm.Locations[ftl.Field] | |||
| if tlm == nil { | |||
| tlm = make(TermLocationMap) | |||
| dm.Locations[ftl.Field] = tlm | |||
| } | |||
| } | |||
| loc := &prealloc[i] | |||
| *loc = ftl.Location | |||
| if len(loc.ArrayPositions) > 0 { // copy | |||
| loc.ArrayPositions = append(ArrayPositions(nil), loc.ArrayPositions...) | |||
| } | |||
| tlm[ftl.Term] = append(tlm[ftl.Term], loc) | |||
| dm.FieldTermLocations[i] = FieldTermLocation{ // recycle | |||
| Location: Location{ | |||
| ArrayPositions: ftl.Location.ArrayPositions[:0], | |||
| }, | |||
| } | |||
| } | |||
| } | |||
| dm.FieldTermLocations = dm.FieldTermLocations[:0] // recycle | |||
| return prealloc | |||
| } | |||
| func (dm *DocumentMatch) String() string { | |||
| return fmt.Sprintf("[%s-%f]", string(dm.IndexInternalID), dm.Score) | |||
| } | |||
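| Reset and the DocumentMatchPool cooperate so per-hit allocations survive recycling; a minimal sketch of the cycle, assuming the bleve search package: | |||
| pool := search.NewDocumentMatchPool(10, 0) // 10 preallocated matches, 0 sort slots | |||
| d := pool.Get() | |||
| d.ID = "doc-1" | |||
| pool.Put(d) // Put resets d but keeps its backing slices for the next Get | |||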
| @@ -135,6 +264,7 @@ type Searcher interface { | |||
| SetQueryNorm(float64) | |||
| Count() uint64 | |||
| Min() int | |||
| Size() int | |||
| DocumentMatchPoolSize() int | |||
| } | |||
| @@ -142,9 +272,26 @@ type Searcher interface { | |||
| type SearcherOptions struct { | |||
| Explain bool | |||
| IncludeTermVectors bool | |||
| Score string | |||
| } | |||
| // SearchContext represents the context around a single search | |||
| type SearchContext struct { | |||
| DocumentMatchPool *DocumentMatchPool | |||
| Collector Collector | |||
| } | |||
| func (sc *SearchContext) Size() int { | |||
| sizeInBytes := reflectStaticSizeSearchContext + size.SizeOfPtr + | |||
| reflectStaticSizeDocumentMatchPool + size.SizeOfPtr | |||
| if sc.DocumentMatchPool != nil { | |||
| for _, entry := range sc.DocumentMatchPool.avail { | |||
| if entry != nil { | |||
| sizeInBytes += entry.Size() | |||
| } | |||
| } | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| @@ -16,12 +16,21 @@ package searcher | |||
| import ( | |||
| "math" | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/search" | |||
| "github.com/blevesearch/bleve/search/scorer" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeBooleanSearcher int | |||
| func init() { | |||
| var bs BooleanSearcher | |||
| reflectStaticSizeBooleanSearcher = int(reflect.TypeOf(bs).Size()) | |||
| } | |||
| type BooleanSearcher struct { | |||
| indexReader index.IndexReader | |||
| mustSearcher search.Searcher | |||
| @@ -52,6 +61,32 @@ func NewBooleanSearcher(indexReader index.IndexReader, mustSearcher search.Searc | |||
| return &rv, nil | |||
| } | |||
| func (s *BooleanSearcher) Size() int { | |||
| sizeInBytes := reflectStaticSizeBooleanSearcher + size.SizeOfPtr | |||
| if s.mustSearcher != nil { | |||
| sizeInBytes += s.mustSearcher.Size() | |||
| } | |||
| if s.shouldSearcher != nil { | |||
| sizeInBytes += s.shouldSearcher.Size() | |||
| } | |||
| if s.mustNotSearcher != nil { | |||
| sizeInBytes += s.mustNotSearcher.Size() | |||
| } | |||
| sizeInBytes += s.scorer.Size() | |||
| for _, entry := range s.matches { | |||
| if entry != nil { | |||
| sizeInBytes += entry.Size() | |||
| } | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| func (s *BooleanSearcher) computeQueryNorm() { | |||
| // first calculate sum of squared weights | |||
| sumOfSquaredWeights := 0.0 | |||
| @@ -284,6 +319,7 @@ func (s *BooleanSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch | |||
| return nil, err | |||
| } | |||
| } | |||
| return rv, nil | |||
| } | |||
| @@ -296,41 +332,52 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter | |||
| } | |||
| } | |||
| var err error | |||
| if s.mustSearcher != nil { | |||
| if s.currMust != nil { | |||
| ctx.DocumentMatchPool.Put(s.currMust) | |||
| } | |||
| s.currMust, err = s.mustSearcher.Advance(ctx, ID) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| if s.shouldSearcher != nil { | |||
| if s.currShould != nil { | |||
| ctx.DocumentMatchPool.Put(s.currShould) | |||
| } | |||
| s.currShould, err = s.shouldSearcher.Advance(ctx, ID) | |||
| if err != nil { | |||
| return nil, err | |||
| // Advance the searchers only if the currentID cursor is trailing the | |||
| // lookup ID. Additionally, if the mustNotSearcher has been initialized, | |||
| // ensure that the cursor used to track it (currMustNot, which isn't | |||
| // tracked by currentID) is trailing the lookup ID as well - because in | |||
| // the case where currentID is nil and currMustNot is already at or ahead | |||
| // of the lookup ID, we MUST NOT advance the currentID or currMustNot cursors. | |||
| if (s.currentID == nil || s.currentID.Compare(ID) < 0) && | |||
| (s.currMustNot == nil || s.currMustNot.IndexInternalID.Compare(ID) < 0) { | |||
| var err error | |||
| if s.mustSearcher != nil { | |||
| if s.currMust != nil { | |||
| ctx.DocumentMatchPool.Put(s.currMust) | |||
| } | |||
| s.currMust, err = s.mustSearcher.Advance(ctx, ID) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| } | |||
| if s.mustNotSearcher != nil { | |||
| if s.currMustNot != nil { | |||
| ctx.DocumentMatchPool.Put(s.currMustNot) | |||
| if s.shouldSearcher != nil { | |||
| if s.currShould != nil { | |||
| ctx.DocumentMatchPool.Put(s.currShould) | |||
| } | |||
| s.currShould, err = s.shouldSearcher.Advance(ctx, ID) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| s.currMustNot, err = s.mustNotSearcher.Advance(ctx, ID) | |||
| if err != nil { | |||
| return nil, err | |||
| if s.mustNotSearcher != nil { | |||
| if s.currMustNot != nil { | |||
| ctx.DocumentMatchPool.Put(s.currMustNot) | |||
| } | |||
| s.currMustNot, err = s.mustNotSearcher.Advance(ctx, ID) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| } | |||
| if s.mustSearcher != nil && s.currMust != nil { | |||
| s.currentID = s.currMust.IndexInternalID | |||
| } else if s.mustSearcher == nil && s.currShould != nil { | |||
| s.currentID = s.currShould.IndexInternalID | |||
| } else { | |||
| s.currentID = nil | |||
| if s.mustSearcher != nil && s.currMust != nil { | |||
| s.currentID = s.currMust.IndexInternalID | |||
| } else if s.mustSearcher == nil && s.currShould != nil { | |||
| s.currentID = s.currShould.IndexInternalID | |||
| } else { | |||
| s.currentID = nil | |||
| } | |||
| } | |||
| return s.Next(ctx) | |||
| @@ -16,13 +16,22 @@ package searcher | |||
| import ( | |||
| "math" | |||
| "reflect" | |||
| "sort" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/search" | |||
| "github.com/blevesearch/bleve/search/scorer" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeConjunctionSearcher int | |||
| func init() { | |||
| var cs ConjunctionSearcher | |||
| reflectStaticSizeConjunctionSearcher = int(reflect.TypeOf(cs).Size()) | |||
| } | |||
| type ConjunctionSearcher struct { | |||
| indexReader index.IndexReader | |||
| searchers OrderedSearcherList | |||
| @@ -34,14 +43,27 @@ type ConjunctionSearcher struct { | |||
| options search.SearcherOptions | |||
| } | |||
| func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.Searcher, options search.SearcherOptions) (*ConjunctionSearcher, error) { | |||
| // build the downstream searchers | |||
| func NewConjunctionSearcher(indexReader index.IndexReader, | |||
| qsearchers []search.Searcher, options search.SearcherOptions) ( | |||
| search.Searcher, error) { | |||
| // build the sorted downstream searchers | |||
| searchers := make(OrderedSearcherList, len(qsearchers)) | |||
| for i, searcher := range qsearchers { | |||
| searchers[i] = searcher | |||
| } | |||
| // sort the searchers | |||
| sort.Sort(searchers) | |||
| // attempt the "unadorned" conjunction optimization only when we | |||
| // do not need extra information like freq-norms or term vectors | |||
| if len(searchers) > 1 && | |||
| options.Score == "none" && !options.IncludeTermVectors { | |||
| rv, err := optimizeCompositeSearcher("conjunction:unadorned", | |||
| indexReader, searchers, options) | |||
| if err != nil || rv != nil { | |||
| return rv, err | |||
| } | |||
| } | |||
| // build our searcher | |||
| rv := ConjunctionSearcher{ | |||
| indexReader: indexReader, | |||
| @@ -51,9 +73,36 @@ func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.S | |||
| scorer: scorer.NewConjunctionQueryScorer(options), | |||
| } | |||
| rv.computeQueryNorm() | |||
| // attempt push-down conjunction optimization when there's >1 searchers | |||
| if len(searchers) > 1 { | |||
| rv, err := optimizeCompositeSearcher("conjunction", | |||
| indexReader, searchers, options) | |||
| if err != nil || rv != nil { | |||
| return rv, err | |||
| } | |||
| } | |||
| return &rv, nil | |||
| } | |||
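| The "unadorned" path is only attempted when the caller opts out of scoring and term vectors; that opt-out is the new SearchRequest.Score field added earlier in this change. A hedged sketch, assuming an open index idx: | |||
| req := bleve.NewSearchRequest(bleve.NewConjunctionQuery( | |||
|     bleve.NewTermQuery("alpha"), bleve.NewTermQuery("beta"))) | |||
| req.Score = "none" // hits come back unscored, enabling the unadorned optimization | |||
| res, err := idx.Search(req) | |||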
| func (s *ConjunctionSearcher) Size() int { | |||
| sizeInBytes := reflectStaticSizeConjunctionSearcher + size.SizeOfPtr + | |||
| s.scorer.Size() | |||
| for _, entry := range s.searchers { | |||
| sizeInBytes += entry.Size() | |||
| } | |||
| for _, entry := range s.currs { | |||
| if entry != nil { | |||
| sizeInBytes += entry.Size() | |||
| } | |||
| } | |||
| return sizeInBytes | |||
| } | |||
| func (s *ConjunctionSearcher) computeQueryNorm() { | |||
| // first calculate sum of squared weights | |||
| sumOfSquaredWeights := 0.0 | |||
| @@ -108,7 +157,7 @@ func (s *ConjunctionSearcher) Next(ctx *search.SearchContext) (*search.DocumentM | |||
| var rv *search.DocumentMatch | |||
| var err error | |||
| OUTER: | |||
| for s.currs[s.maxIDIdx] != nil { | |||
| for s.maxIDIdx < len(s.currs) && s.currs[s.maxIDIdx] != nil { | |||
| maxID := s.currs[s.maxIDIdx].IndexInternalID | |||
| i := 0 | |||
| @@ -1,4 +1,4 @@ | |||
| // Copyright (c) 2014 Couchbase, Inc. | |||
| // Copyright (c) 2018 Couchbase, Inc. | |||
| // | |||
| // Licensed under the Apache License, Version 2.0 (the "License"); | |||
| // you may not use this file except in compliance with the License. | |||
| @@ -16,12 +16,9 @@ package searcher | |||
| import ( | |||
| "fmt" | |||
| "math" | |||
| "sort" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/search" | |||
| "github.com/blevesearch/bleve/search/scorer" | |||
| ) | |||
| // DisjunctionMaxClauseCount is a compile time setting that applications can | |||
| @@ -29,246 +26,84 @@ import ( | |||
| // error instead of executing searches when the size exceeds this value. | |||
| var DisjunctionMaxClauseCount = 0 | |||
| type DisjunctionSearcher struct { | |||
| indexReader index.IndexReader | |||
| searchers OrderedSearcherList | |||
| numSearchers int | |||
| queryNorm float64 | |||
| currs []*search.DocumentMatch | |||
| scorer *scorer.DisjunctionQueryScorer | |||
| min int | |||
| matching []*search.DocumentMatch | |||
| matchingIdxs []int | |||
| initialized bool | |||
| } | |||
| func tooManyClauses(count int) bool { | |||
| if DisjunctionMaxClauseCount != 0 && count > DisjunctionMaxClauseCount { | |||
| return true | |||
| } | |||
| return false | |||
| } | |||
| func tooManyClausesErr() error { | |||
| return fmt.Errorf("TooManyClauses[maxClauseCount is set to %d]", | |||
| DisjunctionMaxClauseCount) | |||
| } | |||
| // DisjunctionHeapTakeover is a compile time setting that applications can | |||
| // adjust to control when the DisjunctionSearcher will switch from a simple | |||
| // slice implementation to a heap implementation. | |||
| var DisjunctionHeapTakeover = 10 | |||
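| // Illustrative sketch: both knobs are plain package-level variables, so an | |||
| // application can tune them before building searchers (values hypothetical): | |||
| // | |||
| //	searcher.DisjunctionMaxClauseCount = 1024 // 0 means no limit | |||
| //	searcher.DisjunctionHeapTakeover = 16     // stay on the slice impl longer | |||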
| // NOTE: the former DisjunctionSearcher struct and its methods were removed | |||
| // in this revision; the same logic continues as DisjunctionSliceSearcher below. | |||
| func NewDisjunctionSearcher(indexReader index.IndexReader, | |||
| qsearchers []search.Searcher, min float64, options search.SearcherOptions) ( | |||
| search.Searcher, error) { | |||
| return newDisjunctionSearcher(indexReader, qsearchers, min, options, true) | |||
| } | |||
| func newDisjunctionSearcher(indexReader index.IndexReader, | |||
| qsearchers []search.Searcher, min float64, options search.SearcherOptions, | |||
| limit bool) (search.Searcher, error) { | |||
| // attempt the "unadorned" disjunction optimization only when we | |||
| // do not need extra information like freq-norm's or term vectors | |||
| // and the requested min is simple | |||
| if len(qsearchers) > 1 && min <= 1 && | |||
| options.Score == "none" && !options.IncludeTermVectors { | |||
| rv, err := optimizeCompositeSearcher("disjunction:unadorned", | |||
| indexReader, qsearchers, options) | |||
| if err != nil || rv != nil { | |||
| return rv, err | |||
| } | |||
| } | |||
| if len(qsearchers) > DisjunctionHeapTakeover { | |||
| return newDisjunctionHeapSearcher(indexReader, qsearchers, min, options, | |||
| limit) | |||
| } | |||
| return newDisjunctionSliceSearcher(indexReader, qsearchers, min, options, | |||
| limit) | |||
| } | |||
| func optimizeCompositeSearcher(optimizationKind string, | |||
| indexReader index.IndexReader, qsearchers []search.Searcher, | |||
| options search.SearcherOptions) (search.Searcher, error) { | |||
| var octx index.OptimizableContext | |||
| for _, searcher := range qsearchers { | |||
| o, ok := searcher.(index.Optimizable) | |||
| if !ok { | |||
| return nil, nil | |||
| } | |||
| var err error | |||
| octx, err = o.Optimize(optimizationKind, octx) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| if octx == nil { | |||
| return nil, nil | |||
| } | |||
| } | |||
| optimized, err := octx.Finish() | |||
| if err != nil || optimized == nil { | |||
| return nil, err | |||
| } | |||
| tfr, ok := optimized.(index.TermFieldReader) | |||
| if !ok { | |||
| return nil, nil | |||
| } | |||
| return newTermSearcherFromReader(indexReader, tfr, | |||
| []byte(optimizationKind), "*", 1.0, options) | |||
| } | |||
| func tooManyClauses(count int) bool { | |||
| if DisjunctionMaxClauseCount != 0 && count > DisjunctionMaxClauseCount { | |||
| return true | |||
| } | |||
| return false | |||
| } | |||
| func tooManyClausesErr(count int) error { | |||
| return fmt.Errorf("TooManyClauses[%d > maxClauseCount, which is set to %d]", | |||
| count, DisjunctionMaxClauseCount) | |||
| } | |||
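| // Reading aid for the optimization above (summary, not vendored code): each | |||
| // child folds itself into one shared OptimizableContext, and Finish() then | |||
| // collapses the whole composite into a single TermFieldReader: | |||
| // | |||
| //	var octx index.OptimizableContext | |||
| //	octx, err = o.Optimize(optimizationKind, octx) // accumulate per child | |||
| //	optimized, err := octx.Finish()                // one reader for all clauses | |||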
| @@ -0,0 +1,343 @@ | |||
| // Copyright (c) 2018 Couchbase, Inc. | |||
| // | |||
| // Licensed under the Apache License, Version 2.0 (the "License"); | |||
| // you may not use this file except in compliance with the License. | |||
| // You may obtain a copy of the License at | |||
| // | |||
| // http://www.apache.org/licenses/LICENSE-2.0 | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software | |||
| // distributed under the License is distributed on an "AS IS" BASIS, | |||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| // See the License for the specific language governing permissions and | |||
| // limitations under the License. | |||
| package searcher | |||
| import ( | |||
| "bytes" | |||
| "container/heap" | |||
| "math" | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/search" | |||
| "github.com/blevesearch/bleve/search/scorer" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeDisjunctionHeapSearcher int | |||
| var reflectStaticSizeSearcherCurr int | |||
| func init() { | |||
| var dhs DisjunctionHeapSearcher | |||
| reflectStaticSizeDisjunctionHeapSearcher = int(reflect.TypeOf(dhs).Size()) | |||
| var sc SearcherCurr | |||
| reflectStaticSizeSearcherCurr = int(reflect.TypeOf(sc).Size()) | |||
| } | |||
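| // Note: reflect.TypeOf(x).Size() captures only the fixed struct footprint | |||
| // (ints, pointers, slice headers); the Size() methods below add the | |||
| // heap-allocated contents on top of these precomputed constants. | |||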
| type SearcherCurr struct { | |||
| searcher search.Searcher | |||
| curr *search.DocumentMatch | |||
| } | |||
| type DisjunctionHeapSearcher struct { | |||
| indexReader index.IndexReader | |||
| numSearchers int | |||
| scorer *scorer.DisjunctionQueryScorer | |||
| min int | |||
| queryNorm float64 | |||
| initialized bool | |||
| searchers []search.Searcher | |||
| heap []*SearcherCurr | |||
| matching []*search.DocumentMatch | |||
| matchingCurrs []*SearcherCurr | |||
| } | |||
| func newDisjunctionHeapSearcher(indexReader index.IndexReader, | |||
| searchers []search.Searcher, min float64, options search.SearcherOptions, | |||
| limit bool) ( | |||
| *DisjunctionHeapSearcher, error) { | |||
| if limit && tooManyClauses(len(searchers)) { | |||
| return nil, tooManyClausesErr(len(searchers)) | |||
| } | |||
| // build our searcher | |||
| rv := DisjunctionHeapSearcher{ | |||
| indexReader: indexReader, | |||
| searchers: searchers, | |||
| numSearchers: len(searchers), | |||
| scorer: scorer.NewDisjunctionQueryScorer(options), | |||
| min: int(min), | |||
| matching: make([]*search.DocumentMatch, len(searchers)), | |||
| matchingCurrs: make([]*SearcherCurr, len(searchers)), | |||
| heap: make([]*SearcherCurr, 0, len(searchers)), | |||
| } | |||
| rv.computeQueryNorm() | |||
| return &rv, nil | |||
| } | |||
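| // Usage sketch (hypothetical arguments): min arrives as float64 for API | |||
| // compatibility and is truncated to int, so a disjunction requiring at | |||
| // least two matching clauses would be built as: | |||
| // | |||
| //	ds, err := newDisjunctionHeapSearcher(reader, searchers, 2, options, true) | |||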
| func (s *DisjunctionHeapSearcher) Size() int { | |||
| sizeInBytes := reflectStaticSizeDisjunctionHeapSearcher + size.SizeOfPtr + | |||
| s.scorer.Size() | |||
| for _, entry := range s.searchers { | |||
| sizeInBytes += entry.Size() | |||
| } | |||
| for _, entry := range s.matching { | |||
| if entry != nil { | |||
| sizeInBytes += entry.Size() | |||
| } | |||
| } | |||
| // for matchingCurrs and heap, just use static size * len | |||
| // since searchers and document matches already counted above | |||
| sizeInBytes += len(s.matchingCurrs) * reflectStaticSizeSearcherCurr | |||
| sizeInBytes += len(s.heap) * reflectStaticSizeSearcherCurr | |||
| return sizeInBytes | |||
| } | |||
| func (s *DisjunctionHeapSearcher) computeQueryNorm() { | |||
| // first calculate sum of squared weights | |||
| sumOfSquaredWeights := 0.0 | |||
| for _, searcher := range s.searchers { | |||
| sumOfSquaredWeights += searcher.Weight() | |||
| } | |||
| // now compute query norm from this | |||
| s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) | |||
| // finally tell all the downstream searchers the norm | |||
| for _, searcher := range s.searchers { | |||
| searcher.SetQueryNorm(s.queryNorm) | |||
| } | |||
| } | |||
| func (s *DisjunctionHeapSearcher) initSearchers(ctx *search.SearchContext) error { | |||
| // alloc a single block of SearcherCurrs | |||
| block := make([]SearcherCurr, len(s.searchers)) | |||
| // get all searchers pointing at their first match | |||
| for i, searcher := range s.searchers { | |||
| curr, err := searcher.Next(ctx) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| if curr != nil { | |||
| block[i].searcher = searcher | |||
| block[i].curr = curr | |||
| heap.Push(s, &block[i]) | |||
| } | |||
| } | |||
| err := s.updateMatches() | |||
| if err != nil { | |||
| return err | |||
| } | |||
| s.initialized = true | |||
| return nil | |||
| } | |||
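| // The heap.Push calls above rely on the Len/Less/Swap/Push/Pop methods | |||
| // defined at the bottom of this file; once initialized, s.heap[0] always | |||
| // holds the SearcherCurr with the smallest IndexInternalID. | |||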
| func (s *DisjunctionHeapSearcher) updateMatches() error { | |||
| matching := s.matching[:0] | |||
| matchingCurrs := s.matchingCurrs[:0] | |||
| if len(s.heap) > 0 { | |||
| // top of the heap is our next hit | |||
| next := heap.Pop(s).(*SearcherCurr) | |||
| matching = append(matching, next.curr) | |||
| matchingCurrs = append(matchingCurrs, next) | |||
| // now as long as top of heap matches, keep popping | |||
| for len(s.heap) > 0 && bytes.Compare(next.curr.IndexInternalID, s.heap[0].curr.IndexInternalID) == 0 { | |||
| next = heap.Pop(s).(*SearcherCurr) | |||
| matching = append(matching, next.curr) | |||
| matchingCurrs = append(matchingCurrs, next) | |||
| } | |||
| } | |||
| s.matching = matching | |||
| s.matchingCurrs = matchingCurrs | |||
| return nil | |||
| } | |||
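| // In short: pop the smallest ID, then keep popping while the top of the | |||
| // heap carries that same ID, so s.matching ends up holding every searcher | |||
| // currently positioned on the next candidate document. | |||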
| func (s *DisjunctionHeapSearcher) Weight() float64 { | |||
| var rv float64 | |||
| for _, searcher := range s.searchers { | |||
| rv += searcher.Weight() | |||
| } | |||
| return rv | |||
| } | |||
| func (s *DisjunctionHeapSearcher) SetQueryNorm(qnorm float64) { | |||
| for _, searcher := range s.searchers { | |||
| searcher.SetQueryNorm(qnorm) | |||
| } | |||
| } | |||
| func (s *DisjunctionHeapSearcher) Next(ctx *search.SearchContext) ( | |||
| *search.DocumentMatch, error) { | |||
| if !s.initialized { | |||
| err := s.initSearchers(ctx) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| var rv *search.DocumentMatch | |||
| found := false | |||
| for !found && len(s.matching) > 0 { | |||
| if len(s.matching) >= s.min { | |||
| found = true | |||
| // score this match | |||
| rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers) | |||
| } | |||
| // invoke next on all the matching searchers | |||
| for _, matchingCurr := range s.matchingCurrs { | |||
| if matchingCurr.curr != rv { | |||
| ctx.DocumentMatchPool.Put(matchingCurr.curr) | |||
| } | |||
| curr, err := matchingCurr.searcher.Next(ctx) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| if curr != nil { | |||
| matchingCurr.curr = curr | |||
| heap.Push(s, matchingCurr) | |||
| } | |||
| } | |||
| err := s.updateMatches() | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| return rv, nil | |||
| } | |||
| func (s *DisjunctionHeapSearcher) Advance(ctx *search.SearchContext, | |||
| ID index.IndexInternalID) (*search.DocumentMatch, error) { | |||
| if !s.initialized { | |||
| err := s.initSearchers(ctx) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| // if there is anything in matching, toss it back onto the heap | |||
| for _, matchingCurr := range s.matchingCurrs { | |||
| heap.Push(s, matchingCurr) | |||
| } | |||
| s.matching = s.matching[:0] | |||
| s.matchingCurrs = s.matchingCurrs[:0] | |||
| // find all searchers that actually need to be advanced | |||
| // advance them, using s.matchingCurrs as temp storage | |||
| for len(s.heap) > 0 && bytes.Compare(s.heap[0].curr.IndexInternalID, ID) < 0 { | |||
| searcherCurr := heap.Pop(s).(*SearcherCurr) | |||
| ctx.DocumentMatchPool.Put(searcherCurr.curr) | |||
| curr, err := searcherCurr.searcher.Advance(ctx, ID) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| if curr != nil { | |||
| searcherCurr.curr = curr | |||
| s.matchingCurrs = append(s.matchingCurrs, searcherCurr) | |||
| } | |||
| } | |||
| // now all of the searchers that we advanced have to be pushed back | |||
| for _, matchingCurr := range s.matchingCurrs { | |||
| heap.Push(s, matchingCurr) | |||
| } | |||
| // reset our temp space | |||
| s.matchingCurrs = s.matchingCurrs[:0] | |||
| err := s.updateMatches() | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| return s.Next(ctx) | |||
| } | |||
| func (s *DisjunctionHeapSearcher) Count() uint64 { | |||
| // for now return a worst case | |||
| var sum uint64 | |||
| for _, searcher := range s.searchers { | |||
| sum += searcher.Count() | |||
| } | |||
| return sum | |||
| } | |||
| func (s *DisjunctionHeapSearcher) Close() (rv error) { | |||
| for _, searcher := range s.searchers { | |||
| err := searcher.Close() | |||
| if err != nil && rv == nil { | |||
| rv = err | |||
| } | |||
| } | |||
| return rv | |||
| } | |||
| func (s *DisjunctionHeapSearcher) Min() int { | |||
| return s.min | |||
| } | |||
| func (s *DisjunctionHeapSearcher) DocumentMatchPoolSize() int { | |||
| rv := len(s.searchers) | |||
| for _, s := range s.searchers { | |||
| rv += s.DocumentMatchPoolSize() | |||
| } | |||
| return rv | |||
| } | |||
| // a disjunction searcher implements the index.Optimizable interface | |||
| // but only activates on an edge case where the disjunction is a | |||
| // wrapper around a single Optimizable child searcher | |||
| func (s *DisjunctionHeapSearcher) Optimize(kind string, octx index.OptimizableContext) ( | |||
| index.OptimizableContext, error) { | |||
| if len(s.searchers) == 1 { | |||
| o, ok := s.searchers[0].(index.Optimizable) | |||
| if ok { | |||
| return o.Optimize(kind, octx) | |||
| } | |||
| } | |||
| return octx, nil | |||
| } | |||
| // heap impl | |||
| func (s *DisjunctionHeapSearcher) Len() int { return len(s.heap) } | |||
| func (s *DisjunctionHeapSearcher) Less(i, j int) bool { | |||
| if s.heap[i].curr == nil { | |||
| return true | |||
| } else if s.heap[j].curr == nil { | |||
| return false | |||
| } | |||
| return bytes.Compare(s.heap[i].curr.IndexInternalID, s.heap[j].curr.IndexInternalID) < 0 | |||
| } | |||
| func (s *DisjunctionHeapSearcher) Swap(i, j int) { | |||
| s.heap[i], s.heap[j] = s.heap[j], s.heap[i] | |||
| } | |||
| func (s *DisjunctionHeapSearcher) Push(x interface{}) { | |||
| s.heap = append(s.heap, x.(*SearcherCurr)) | |||
| } | |||
| func (s *DisjunctionHeapSearcher) Pop() interface{} { | |||
| old := s.heap | |||
| n := len(old) | |||
| x := old[n-1] | |||
| s.heap = old[0 : n-1] | |||
| return x | |||
| } | |||
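| // Minimal sketch of the heap.Interface contract satisfied above | |||
| // (identifiers illustrative): | |||
| // | |||
| //	heap.Init(s)                        // establish the invariant | |||
| //	heap.Push(s, &SearcherCurr{})       // calls s.Push, then sifts up | |||
| //	next := heap.Pop(s).(*SearcherCurr) // smallest IndexInternalID first | |||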
| @@ -0,0 +1,298 @@ | |||
| // Copyright (c) 2018 Couchbase, Inc. | |||
| // | |||
| // Licensed under the Apache License, Version 2.0 (the "License"); | |||
| // you may not use this file except in compliance with the License. | |||
| // You may obtain a copy of the License at | |||
| // | |||
| // http://www.apache.org/licenses/LICENSE-2.0 | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software | |||
| // distributed under the License is distributed on an "AS IS" BASIS, | |||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| // See the License for the specific language governing permissions and | |||
| // limitations under the License. | |||
| package searcher | |||
| import ( | |||
| "math" | |||
| "reflect" | |||
| "sort" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/search" | |||
| "github.com/blevesearch/bleve/search/scorer" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeDisjunctionSliceSearcher int | |||
| func init() { | |||
| var ds DisjunctionSliceSearcher | |||
| reflectStaticSizeDisjunctionSliceSearcher = int(reflect.TypeOf(ds).Size()) | |||
| } | |||
| type DisjunctionSliceSearcher struct { | |||
| indexReader index.IndexReader | |||
| searchers OrderedSearcherList | |||
| numSearchers int | |||
| queryNorm float64 | |||
| currs []*search.DocumentMatch | |||
| scorer *scorer.DisjunctionQueryScorer | |||
| min int | |||
| matching []*search.DocumentMatch | |||
| matchingIdxs []int | |||
| initialized bool | |||
| } | |||
| func newDisjunctionSliceSearcher(indexReader index.IndexReader, | |||
| qsearchers []search.Searcher, min float64, options search.SearcherOptions, | |||
| limit bool) ( | |||
| *DisjunctionSliceSearcher, error) { | |||
| if limit && tooManyClauses(len(qsearchers)) { | |||
| return nil, tooManyClausesErr(len(qsearchers)) | |||
| } | |||
| // build the downstream searchers | |||
| searchers := make(OrderedSearcherList, len(qsearchers)) | |||
| for i, searcher := range qsearchers { | |||
| searchers[i] = searcher | |||
| } | |||
| // sort the searchers | |||
| sort.Sort(sort.Reverse(searchers)) | |||
| // build our searcher | |||
| rv := DisjunctionSliceSearcher{ | |||
| indexReader: indexReader, | |||
| searchers: searchers, | |||
| numSearchers: len(searchers), | |||
| currs: make([]*search.DocumentMatch, len(searchers)), | |||
| scorer: scorer.NewDisjunctionQueryScorer(options), | |||
| min: int(min), | |||
| matching: make([]*search.DocumentMatch, len(searchers)), | |||
| matchingIdxs: make([]int, len(searchers)), | |||
| } | |||
| rv.computeQueryNorm() | |||
| return &rv, nil | |||
| } | |||
| func (s *DisjunctionSliceSearcher) Size() int { | |||
| sizeInBytes := reflectStaticSizeDisjunctionSliceSearcher + size.SizeOfPtr + | |||
| s.scorer.Size() | |||
| for _, entry := range s.searchers { | |||
| sizeInBytes += entry.Size() | |||
| } | |||
| for _, entry := range s.currs { | |||
| if entry != nil { | |||
| sizeInBytes += entry.Size() | |||
| } | |||
| } | |||
| for _, entry := range s.matching { | |||
| if entry != nil { | |||
| sizeInBytes += entry.Size() | |||
| } | |||
| } | |||
| sizeInBytes += len(s.matchingIdxs) * size.SizeOfInt | |||
| return sizeInBytes | |||
| } | |||
| func (s *DisjunctionSliceSearcher) computeQueryNorm() { | |||
| // first calculate sum of squared weights | |||
| sumOfSquaredWeights := 0.0 | |||
| for _, searcher := range s.searchers { | |||
| sumOfSquaredWeights += searcher.Weight() | |||
| } | |||
| // now compute query norm from this | |||
| s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) | |||
| // finally tell all the downstream searchers the norm | |||
| for _, searcher := range s.searchers { | |||
| searcher.SetQueryNorm(s.queryNorm) | |||
| } | |||
| } | |||
| func (s *DisjunctionSliceSearcher) initSearchers(ctx *search.SearchContext) error { | |||
| var err error | |||
| // get all searchers pointing at their first match | |||
| for i, searcher := range s.searchers { | |||
| if s.currs[i] != nil { | |||
| ctx.DocumentMatchPool.Put(s.currs[i]) | |||
| } | |||
| s.currs[i], err = searcher.Next(ctx) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| } | |||
| err = s.updateMatches() | |||
| if err != nil { | |||
| return err | |||
| } | |||
| s.initialized = true | |||
| return nil | |||
| } | |||
| func (s *DisjunctionSliceSearcher) updateMatches() error { | |||
| matching := s.matching[:0] | |||
| matchingIdxs := s.matchingIdxs[:0] | |||
| for i := 0; i < len(s.currs); i++ { | |||
| curr := s.currs[i] | |||
| if curr == nil { | |||
| continue | |||
| } | |||
| if len(matching) > 0 { | |||
| cmp := curr.IndexInternalID.Compare(matching[0].IndexInternalID) | |||
| if cmp > 0 { | |||
| continue | |||
| } | |||
| if cmp < 0 { | |||
| matching = matching[:0] | |||
| matchingIdxs = matchingIdxs[:0] | |||
| } | |||
| } | |||
| matching = append(matching, curr) | |||
| matchingIdxs = append(matchingIdxs, i) | |||
| } | |||
| s.matching = matching | |||
| s.matchingIdxs = matchingIdxs | |||
| return nil | |||
| } | |||
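| // One linear pass: an ID greater than the current minimum is skipped, a | |||
| // smaller one resets the accumulators, and equal IDs accumulate, leaving | |||
| // s.matching with every searcher sitting on the lowest IndexInternalID. | |||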
| func (s *DisjunctionSliceSearcher) Weight() float64 { | |||
| var rv float64 | |||
| for _, searcher := range s.searchers { | |||
| rv += searcher.Weight() | |||
| } | |||
| return rv | |||
| } | |||
| func (s *DisjunctionSliceSearcher) SetQueryNorm(qnorm float64) { | |||
| for _, searcher := range s.searchers { | |||
| searcher.SetQueryNorm(qnorm) | |||
| } | |||
| } | |||
| func (s *DisjunctionSliceSearcher) Next(ctx *search.SearchContext) ( | |||
| *search.DocumentMatch, error) { | |||
| if !s.initialized { | |||
| err := s.initSearchers(ctx) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| var err error | |||
| var rv *search.DocumentMatch | |||
| found := false | |||
| for !found && len(s.matching) > 0 { | |||
| if len(s.matching) >= s.min { | |||
| found = true | |||
| // score this match | |||
| rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers) | |||
| } | |||
| // invoke next on all the matching searchers | |||
| for _, i := range s.matchingIdxs { | |||
| searcher := s.searchers[i] | |||
| if s.currs[i] != rv { | |||
| ctx.DocumentMatchPool.Put(s.currs[i]) | |||
| } | |||
| s.currs[i], err = searcher.Next(ctx) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| err = s.updateMatches() | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| return rv, nil | |||
| } | |||
| func (s *DisjunctionSliceSearcher) Advance(ctx *search.SearchContext, | |||
| ID index.IndexInternalID) (*search.DocumentMatch, error) { | |||
| if !s.initialized { | |||
| err := s.initSearchers(ctx) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| // get all searchers pointing at their first match | |||
| var err error | |||
| for i, searcher := range s.searchers { | |||
| if s.currs[i] != nil { | |||
| if s.currs[i].IndexInternalID.Compare(ID) >= 0 { | |||
| continue | |||
| } | |||
| ctx.DocumentMatchPool.Put(s.currs[i]) | |||
| } | |||
| s.currs[i], err = searcher.Advance(ctx, ID) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| } | |||
| err = s.updateMatches() | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| return s.Next(ctx) | |||
| } | |||
| func (s *DisjunctionSliceSearcher) Count() uint64 { | |||
| // for now return a worst case | |||
| var sum uint64 | |||
| for _, searcher := range s.searchers { | |||
| sum += searcher.Count() | |||
| } | |||
| return sum | |||
| } | |||
| func (s *DisjunctionSliceSearcher) Close() (rv error) { | |||
| for _, searcher := range s.searchers { | |||
| err := searcher.Close() | |||
| if err != nil && rv == nil { | |||
| rv = err | |||
| } | |||
| } | |||
| return rv | |||
| } | |||
| func (s *DisjunctionSliceSearcher) Min() int { | |||
| return s.min | |||
| } | |||
| func (s *DisjunctionSliceSearcher) DocumentMatchPoolSize() int { | |||
| rv := len(s.currs) | |||
| for _, s := range s.searchers { | |||
| rv += s.DocumentMatchPoolSize() | |||
| } | |||
| return rv | |||
| } | |||
| // a disjunction searcher implements the index.Optimizable interface | |||
| // but only activates on an edge case where the disjunction is a | |||
| // wrapper around a single Optimizable child searcher | |||
| func (s *DisjunctionSliceSearcher) Optimize(kind string, octx index.OptimizableContext) ( | |||
| index.OptimizableContext, error) { | |||
| if len(s.searchers) == 1 { | |||
| o, ok := s.searchers[0].(index.Optimizable) | |||
| if ok { | |||
| return o.Optimize(kind, octx) | |||
| } | |||
| } | |||
| return octx, nil | |||
| } | |||
| @@ -15,11 +15,21 @@ | |||
| package searcher | |||
| import ( | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/search" | |||
| "github.com/blevesearch/bleve/search/scorer" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeDocIDSearcher int | |||
| func init() { | |||
| var ds DocIDSearcher | |||
| reflectStaticSizeDocIDSearcher = int(reflect.TypeOf(ds).Size()) | |||
| } | |||
| // DocIDSearcher returns documents matching a predefined set of identifiers. | |||
| type DocIDSearcher struct { | |||
| reader index.DocIDReader | |||
| @@ -42,6 +52,12 @@ func NewDocIDSearcher(indexReader index.IndexReader, ids []string, boost float64 | |||
| }, nil | |||
| } | |||
| func (s *DocIDSearcher) Size() int { | |||
| return reflectStaticSizeDocIDSearcher + size.SizeOfPtr + | |||
| s.reader.Size() + | |||
| s.scorer.Size() | |||
| } | |||
| func (s *DocIDSearcher) Count() uint64 { | |||
| return uint64(s.count) | |||
| } | |||
| @@ -15,10 +15,20 @@ | |||
| package searcher | |||
| import ( | |||
| "reflect" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/search" | |||
| "github.com/blevesearch/bleve/size" | |||
| ) | |||
| var reflectStaticSizeFilteringSearcher int | |||
| func init() { | |||
| var fs FilteringSearcher | |||
| reflectStaticSizeFilteringSearcher = int(reflect.TypeOf(fs).Size()) | |||
| } | |||
| // FilterFunc defines a function which can filter documents | |||
| // returning true means keep the document | |||
| // returning false means do not keep the document | |||
| @@ -38,6 +48,11 @@ func NewFilteringSearcher(s search.Searcher, filter FilterFunc) *FilteringSearch | |||
| } | |||
| } | |||
| func (f *FilteringSearcher) Size() int { | |||
| return reflectStaticSizeFilteringSearcher + size.SizeOfPtr + | |||
| f.child.Size() | |||
| } | |||
| func (f *FilteringSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { | |||
| next, err := f.child.Next(ctx) | |||
| for next != nil && err == nil { | |||
| @@ -15,13 +15,26 @@ | |||
| package searcher | |||
| import ( | |||
| "fmt" | |||
| "github.com/blevesearch/bleve/index" | |||
| "github.com/blevesearch/bleve/search" | |||
| ) | |||
| var MaxFuzziness = 2 | |||
| func NewFuzzySearcher(indexReader index.IndexReader, term string, | |||
| prefix, fuzziness int, field string, boost float64, | |||
| options search.SearcherOptions) (search.Searcher, error) { | |||
| if fuzziness > MaxFuzziness { | |||
| return nil, fmt.Errorf("fuzziness exceeds max (%d)", MaxFuzziness) | |||
| } | |||
| if fuzziness < 0 { | |||
| return nil, fmt.Errorf("invalid fuzziness, negative") | |||
| } | |||
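| // Usage sketch (hypothetical arguments): fuzziness must land in [0, 2] | |||
| // unless MaxFuzziness is raised: | |||
| // | |||
| //	s, err := NewFuzzySearcher(reader, "smith", 0, 2, "name", 1.0, options) | |||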
| // Note: we don't byte slice the term for a prefix because of runes. | |||
| prefixTerm := "" | |||
| for i, r := range term { | |||
| @@ -31,7 +44,6 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string, | |||
| break | |||
| } | |||
| } | |||
| candidateTerms, err := findFuzzyCandidateTerms(indexReader, term, fuzziness, | |||
| field, prefixTerm) | |||
| if err != nil { | |||
| @@ -45,12 +57,40 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string, | |||
| func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, | |||
| fuzziness int, field, prefixTerm string) (rv []string, err error) { | |||
| rv = make([]string, 0) | |||
| // in case of advanced reader implementations directly call | |||
| // the levenshtein automaton based iterator to collect the | |||
| // candidate terms | |||
| if ir, ok := indexReader.(index.IndexReaderFuzzy); ok { | |||
| fieldDict, err := ir.FieldDictFuzzy(field, term, fuzziness, prefixTerm) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| defer func() { | |||
| if cerr := fieldDict.Close(); cerr != nil && err == nil { | |||
| err = cerr | |||
| } | |||
| }() | |||
| tfd, err := fieldDict.Next() | |||
| for err == nil && tfd != nil { | |||
| rv = append(rv, tfd.Term) | |||
| if tooManyClauses(len(rv)) { | |||
| return nil, tooManyClausesErr(len(rv)) | |||
| } | |||
| tfd, err = fieldDict.Next() | |||
| } | |||
| return rv, err | |||
| } | |||
| var fieldDict index.FieldDict | |||
| if len(prefixTerm) > 0 { | |||
| fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm)) | |||
| } else { | |||
| fieldDict, err = indexReader.FieldDict(field) | |||
| } | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| defer func() { | |||
| if cerr := fieldDict.Close(); cerr != nil && err == nil { | |||
| err = cerr | |||
| @@ -58,13 +98,16 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, | |||
| }() | |||
| // enumerate terms and check levenshtein distance | |||
| var reuse []int | |||
| tfd, err := fieldDict.Next() | |||
| for err == nil && tfd != nil { | |||
| ld, exceeded := search.LevenshteinDistanceMax(term, tfd.Term, fuzziness) | |||
| var ld int | |||
| var exceeded bool | |||
| ld, exceeded, reuse = search.LevenshteinDistanceMaxReuseSlice(term, tfd.Term, fuzziness, reuse) | |||
| if !exceeded && ld <= fuzziness { | |||
| rv = append(rv, tfd.Term) | |||
| if tooManyClauses(len(rv)) { | |||
| return rv, tooManyClausesErr() | |||
| return nil, tooManyClausesErr(len(rv)) | |||
| } | |||
| } | |||
| tfd, err = fieldDict.Next() | |||