Monorepo for Tangled
tangled.org
1// heavily inspired by gitea's model (basically copy-pasted)
2package repos_indexer
3
4import (
5 "context"
6 "errors"
7 "log"
8 "os"
9 "time"
10
11 "github.com/blevesearch/bleve/v2"
12 "github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
13 "github.com/blevesearch/bleve/v2/analysis/token/camelcase"
14 "github.com/blevesearch/bleve/v2/analysis/token/lowercase"
15 "github.com/blevesearch/bleve/v2/analysis/token/ngram"
16 "github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
17 "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
18 "github.com/blevesearch/bleve/v2/index/upsidedown"
19 "github.com/blevesearch/bleve/v2/mapping"
20 "github.com/blevesearch/bleve/v2/search"
21 "github.com/blevesearch/bleve/v2/search/query"
22 "tangled.org/core/appview/db"
23 "tangled.org/core/appview/indexer/base36"
24 bleveutil "tangled.org/core/appview/indexer/bleve"
25 "tangled.org/core/appview/models"
26 "tangled.org/core/appview/pagination"
27 tlog "tangled.org/core/log"
28)
29
// Names registered in the bleve index mapping.
const (
	// repoIndexerAnalyzer names the custom analyzer installed as the
	// mapping's default analyzer and used for free-text queries.
	repoIndexerAnalyzer = "repoIndexer"

	// repoIndexerDocType is the document type under which repo documents
	// are mapped; repoData.Type reports it to bleve.
	repoIndexerDocType = "repoIndexerDocType"

	// unicodeNormalizeName names the custom unicode-normalization token filter.
	unicodeNormalizeName = "unicodeNormalize"
)

// repoIndexerVersion is persisted inside the index and compared on open.
// Bump it whenever the index mapping changes to trigger a rebuild.
const repoIndexerVersion = 7
39
40type Indexer struct {
41 indexer bleve.Index
42 path string
43}
44
45func NewIndexer(indexDir string) *Indexer {
46 return &Indexer{
47 path: indexDir,
48 }
49}
50
51// Init initializes the indexer
52func (ix *Indexer) Init(ctx context.Context, e db.Execer) {
53 l := tlog.FromContext(ctx)
54 existed, err := ix.initialize(ctx)
55 if err != nil {
56 log.Fatalln("failed to initialize repo indexer", err)
57 }
58 if !existed {
59 l.Debug("Populating the repo indexer")
60 err := PopulateIndexer(ctx, ix, e)
61 if err != nil {
62 log.Fatalln("failed to populate repo indexer", err)
63 }
64 }
65
66 count, _ := ix.indexer.DocCount()
67 l.Info("Initialized the repo indexer", "docCount", count)
68}
69
70func generateRepoIndexMapping() (mapping.IndexMapping, error) {
71 mapping := bleve.NewIndexMapping()
72 docMapping := bleve.NewDocumentMapping()
73
74 textFieldMapping := bleve.NewTextFieldMapping()
75 textFieldMapping.Store = false
76 textFieldMapping.IncludeInAll = false
77
78 keywordFieldMapping := bleve.NewKeywordFieldMapping()
79 keywordFieldMapping.Store = false
80 keywordFieldMapping.IncludeInAll = false
81
82 // case-insensitive keyword field for language and topics
83 caseInsensitiveKeywordMapping := bleve.NewTextFieldMapping()
84 caseInsensitiveKeywordMapping.Store = false
85 caseInsensitiveKeywordMapping.IncludeInAll = false
86 caseInsensitiveKeywordMapping.Analyzer = "keyword_lowercase"
87
88 // trigram field for partial repo name matching
89 trigramFieldMapping := bleve.NewTextFieldMapping()
90 trigramFieldMapping.Store = false
91 trigramFieldMapping.IncludeInAll = false
92 trigramFieldMapping.Analyzer = "trigram"
93
94 // numeric field mapping for sorting by counts
95 numericFieldMapping := bleve.NewNumericFieldMapping()
96 numericFieldMapping.Store = false
97 numericFieldMapping.IncludeInAll = false
98 numericFieldMapping.DocValues = true // required for sorting
99
100 // datetime field mapping for sorting by creation date
101 dateFieldMapping := bleve.NewDateTimeFieldMapping()
102 dateFieldMapping.Store = false
103 dateFieldMapping.IncludeInAll = false
104 dateFieldMapping.DocValues = true // required for sorting
105
106 // boolean field mapping for fork detection
107 booleanFieldMapping := bleve.NewBooleanFieldMapping()
108 booleanFieldMapping.Store = false
109 booleanFieldMapping.IncludeInAll = false
110
111 // text fields
112 docMapping.AddFieldMappingsAt("name", textFieldMapping)
113 docMapping.AddFieldMappingsAt("name_trigram", trigramFieldMapping)
114 docMapping.AddFieldMappingsAt("description", textFieldMapping)
115 docMapping.AddFieldMappingsAt("website", textFieldMapping)
116 docMapping.AddFieldMappingsAt("topics", textFieldMapping)
117
118 // keyword fields
119 docMapping.AddFieldMappingsAt("language", caseInsensitiveKeywordMapping)
120 docMapping.AddFieldMappingsAt("topics_exact", caseInsensitiveKeywordMapping)
121 docMapping.AddFieldMappingsAt("did", keywordFieldMapping)
122 docMapping.AddFieldMappingsAt("knot", keywordFieldMapping)
123 docMapping.AddFieldMappingsAt("repo_did", keywordFieldMapping)
124
125 // fork indicator for down-ranking
126 docMapping.AddFieldMappingsAt("is_fork", booleanFieldMapping)
127
128 // sortable numeric fields
129 docMapping.AddFieldMappingsAt("star_count", numericFieldMapping)
130 docMapping.AddFieldMappingsAt("issue_count", numericFieldMapping)
131 docMapping.AddFieldMappingsAt("pull_count", numericFieldMapping)
132
133 // sortable date field
134 docMapping.AddFieldMappingsAt("created", dateFieldMapping)
135
136 err := mapping.AddCustomTokenFilter(unicodeNormalizeName, map[string]any{
137 "type": unicodenorm.Name,
138 "form": unicodenorm.NFC,
139 })
140 if err != nil {
141 return nil, err
142 }
143
144 err = mapping.AddCustomTokenFilter("edgeNgram3", map[string]any{
145 "type": ngram.Name,
146 "min": 2.0,
147 "max": 3.0,
148 })
149 if err != nil {
150 return nil, err
151 }
152
153 err = mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
154 "type": custom.Name,
155 "char_filters": []string{},
156 "tokenizer": unicode.Name,
157 "token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
158 })
159 if err != nil {
160 return nil, err
161 }
162
163 err = mapping.AddCustomAnalyzer("keyword_lowercase", map[string]any{
164 "type": custom.Name,
165 "char_filters": []string{},
166 "tokenizer": "single",
167 "token_filters": []string{lowercase.Name},
168 })
169 if err != nil {
170 return nil, err
171 }
172
173 err = mapping.AddCustomAnalyzer("trigram", map[string]any{
174 "type": custom.Name,
175 "char_filters": []string{},
176 "tokenizer": "single",
177 "token_filters": []string{lowercase.Name, "edgeNgram3"},
178 })
179 if err != nil {
180 return nil, err
181 }
182
183 mapping.DefaultAnalyzer = repoIndexerAnalyzer
184 mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
185 mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
186 mapping.DefaultMapping = bleve.NewDocumentDisabledMapping()
187
188 return mapping, nil
189}
190
191func (ix *Indexer) initialize(ctx context.Context) (bool, error) {
192 if ix.indexer != nil {
193 return false, errors.New("indexer is already initialized")
194 }
195
196 indexer, err := openIndexer(ctx, ix.path, repoIndexerVersion)
197 if err != nil {
198 return false, err
199 }
200 if indexer != nil {
201 ix.indexer = indexer
202 return true, nil
203 }
204
205 mapping, err := generateRepoIndexMapping()
206 if err != nil {
207 return false, err
208 }
209 indexer, err = bleve.New(ix.path, mapping)
210 if err != nil {
211 return false, err
212 }
213 indexer.SetInternal([]byte("mapping_version"), []byte{byte(repoIndexerVersion)})
214
215 ix.indexer = indexer
216
217 return false, nil
218}
219
220func openIndexer(ctx context.Context, path string, version int) (bleve.Index, error) {
221 l := tlog.FromContext(ctx)
222 indexer, err := bleve.Open(path)
223 if err != nil {
224 if errors.Is(err, upsidedown.IncompatibleVersion) {
225 l.Info("Indexer was built with a previous version of bleve, deleting and rebuilding")
226 return nil, os.RemoveAll(path)
227 }
228 return nil, nil
229 }
230
231 storedVersion, _ := indexer.GetInternal([]byte("mapping_version"))
232 if storedVersion == nil || int(storedVersion[0]) != version {
233 l.Info("Indexer mapping version changed, deleting and rebuilding")
234 indexer.Close()
235 return nil, os.RemoveAll(path)
236 }
237
238 return indexer, nil
239}
240
241func PopulateIndexer(ctx context.Context, ix *Indexer, e db.Execer) error {
242 l := tlog.FromContext(ctx)
243 count := 0
244
245 err := pagination.IterateAll(
246 func(page pagination.Page) ([]models.Repo, error) {
247 return db.GetReposPaginated(e, page)
248 },
249 func(repos []models.Repo) error {
250 count += len(repos)
251 return ix.Index(ctx, repos...)
252 },
253 )
254
255 l.Info("repos indexed", "count", count)
256 return err
257}
258
// repoData is the document shape handed to bleve for one repo. The JSON tag
// of each field matches a field name configured in the index mapping, except
// for "id", which has no field mapping.
type repoData struct {
	ID      int64  `json:"id"`
	RepoDid string `json:"repo_did"`
	Did     string `json:"did"`

	// Name and NameTrigram carry the same text under two differently
	// analyzed fields; likewise Topics and TopicsExact.
	Name        string   `json:"name"`
	NameTrigram string   `json:"name_trigram"`
	Description string   `json:"description"`
	Website     string   `json:"website"`
	Topics      []string `json:"topics"`
	TopicsExact []string `json:"topics_exact"`

	Knot     string `json:"knot"`
	Language string `json:"language"`
	IsFork   bool   `json:"is_fork"`

	// Fields that search results can be sorted by.
	StarCount  int       `json:"star_count"`
	IssueCount int       `json:"issue_count"`
	PullCount  int       `json:"pull_count"`
	Created    time.Time `json:"created"`
}
279
280func makeRepoData(repo *models.Repo) *repoData {
281 var language string
282 var starCount, issueCount, pullCount int
283
284 if repo.RepoStats != nil {
285 language = repo.RepoStats.Language
286 starCount = repo.RepoStats.StarCount
287 issueCount = repo.RepoStats.IssueCount.Open + repo.RepoStats.IssueCount.Closed
288 pullCount = repo.RepoStats.PullCount.Open +
289 repo.RepoStats.PullCount.Merged +
290 repo.RepoStats.PullCount.Closed
291 }
292
293 isFork := repo.Source != ""
294
295 return &repoData{
296 ID: repo.Id,
297 RepoDid: repo.RepoDid,
298 Did: repo.Did,
299 Name: repo.Name,
300 NameTrigram: repo.Name,
301 Description: repo.Description,
302 Website: repo.Website,
303 Topics: repo.Topics,
304 TopicsExact: repo.Topics,
305 Knot: repo.Knot,
306 Language: language,
307 IsFork: isFork,
308 StarCount: starCount,
309 IssueCount: issueCount,
310 PullCount: pullCount,
311 Created: repo.Created,
312 }
313}
314
315// Type returns the document type, for bleve's mapping.Classifier interface.
316func (r *repoData) Type() string {
317 return repoIndexerDocType
318}
319
// SearchResult is one page of results produced by Indexer.Search.
type SearchResult struct {
	// Hits holds the decoded repo IDs of the returned page, in result order.
	Hits []int64
	// Total is the total hit count reported by bleve for the query.
	Total uint64
	// Duration is the time bleve reported for executing the search.
	Duration time.Duration
}
325
// maxBatchSize is the size handed to bleveutil.NewFlushingBatch by Index;
// presumably the number of buffered operations before a flush (confirm
// against the bleveutil package).
const maxBatchSize = 20
327
328func (ix *Indexer) Index(ctx context.Context, repos ...models.Repo) error {
329 batch := bleveutil.NewFlushingBatch(ix.indexer, maxBatchSize)
330 for _, repo := range repos {
331 repoData := makeRepoData(&repo)
332 if err := batch.Index(base36.Encode(repo.Id), repoData); err != nil {
333 return err
334 }
335 }
336 return batch.Flush()
337}
338
339func (ix *Indexer) Delete(ctx context.Context, repoID int64) error {
340 return ix.indexer.Delete(base36.Encode(repoID))
341}
342
343func (ix *Indexer) TotalDocCount() (uint64, error) {
344 return ix.indexer.DocCount()
345}
346
347func (ix *Indexer) Search(ctx context.Context, opts models.RepoSearchOptions) (*SearchResult, error) {
348 var musts []query.Query
349 var mustNots []query.Query
350
351 for _, keyword := range opts.Keywords {
352 musts = append(musts, bleve.NewDisjunctionQuery(
353 bleveutil.MatchAndQuery("name", keyword, repoIndexerAnalyzer, 0),
354 bleveutil.MatchAndQuery("name_trigram", keyword, "trigram", 0),
355 bleveutil.MatchAndQuery("description", keyword, repoIndexerAnalyzer, 0),
356 bleveutil.MatchAndQuery("website", keyword, repoIndexerAnalyzer, 0),
357 bleveutil.MatchAndQuery("topics", keyword, repoIndexerAnalyzer, 0),
358 ))
359 }
360
361 for _, phrase := range opts.Phrases {
362 musts = append(musts, bleve.NewDisjunctionQuery(
363 bleveutil.MatchPhraseQuery("name", phrase, repoIndexerAnalyzer),
364 bleveutil.MatchPhraseQuery("description", phrase, repoIndexerAnalyzer),
365 bleveutil.MatchPhraseQuery("website", phrase, repoIndexerAnalyzer),
366 bleveutil.MatchPhraseQuery("topics", phrase, repoIndexerAnalyzer),
367 ))
368 }
369
370 for _, keyword := range opts.NegatedKeywords {
371 mustNots = append(mustNots, bleve.NewDisjunctionQuery(
372 bleveutil.MatchAndQuery("name", keyword, repoIndexerAnalyzer, 0),
373 bleveutil.MatchAndQuery("description", keyword, repoIndexerAnalyzer, 0),
374 bleveutil.MatchAndQuery("website", keyword, repoIndexerAnalyzer, 0),
375 bleveutil.MatchAndQuery("topics", keyword, repoIndexerAnalyzer, 0),
376 ))
377 }
378
379 for _, phrase := range opts.NegatedPhrases {
380 mustNots = append(mustNots, bleve.NewDisjunctionQuery(
381 bleveutil.MatchPhraseQuery("name", phrase, repoIndexerAnalyzer),
382 bleveutil.MatchPhraseQuery("description", phrase, repoIndexerAnalyzer),
383 bleveutil.MatchPhraseQuery("website", phrase, repoIndexerAnalyzer),
384 bleveutil.MatchPhraseQuery("topics", phrase, repoIndexerAnalyzer),
385 ))
386 }
387
388 // keyword filters
389 if opts.Language != "" {
390 musts = append(musts, bleveutil.MatchAndQuery("language", opts.Language, "keyword_lowercase", 0))
391 }
392
393 if opts.Knot != "" {
394 musts = append(musts, bleveutil.KeywordFieldQuery("knot", opts.Knot))
395 }
396
397 if opts.Did != "" {
398 musts = append(musts, bleveutil.KeywordFieldQuery("did", opts.Did))
399 }
400
401 for _, topic := range opts.Topics {
402 musts = append(musts, bleveutil.MatchAndQuery("topics_exact", topic, "keyword_lowercase", 0))
403 }
404
405 for _, topic := range opts.NegatedTopics {
406 mustNots = append(mustNots, bleveutil.MatchAndQuery("topics_exact", topic, "keyword_lowercase", 0))
407 }
408
409 indexerQuery := bleve.NewBooleanQuery()
410 if len(musts) == 0 {
411 musts = append(musts, bleve.NewMatchAllQuery())
412 }
413 indexerQuery.AddMust(musts...)
414 indexerQuery.AddMustNot(mustNots...)
415
416 // use a disjunction where:
417 // - repos with more stars get higher boost
418 // - non-forks get a boost
419 // - boosts stack
420 finalQuery := bleve.NewDisjunctionQuery()
421
422 // add the main query
423 finalQuery.AddQuery(indexerQuery)
424
425 // add a boosted query for non-forks
426 notForkQuery := bleve.NewBooleanQuery()
427 notForkQuery.AddMust(indexerQuery)
428 isForkQuery := bleve.NewBoolFieldQuery(true)
429 isForkQuery.SetField("is_fork")
430 notForkQuery.AddMustNot(isForkQuery)
431 notForkQuery.SetBoost(2.0)
432 finalQuery.AddQuery(notForkQuery)
433
434 // add boosted queries for repos with more stars
435 // 10-99 stars
436 tier2Query := bleve.NewBooleanQuery()
437 tier2Query.AddMust(indexerQuery)
438 min10 := float64(10)
439 max99 := float64(99)
440 starRange2 := bleveutil.NumericRangeQuery("star_count", &min10, &max99)
441 tier2Query.AddMust(starRange2)
442 tier2Query.SetBoost(1.5)
443 finalQuery.AddQuery(tier2Query)
444
445 // 100-999 stars
446 tier3Query := bleve.NewBooleanQuery()
447 tier3Query.AddMust(indexerQuery)
448 min100 := float64(100)
449 max999 := float64(999)
450 starRange3 := bleveutil.NumericRangeQuery("star_count", &min100, &max999)
451 tier3Query.AddMust(starRange3)
452 tier3Query.SetBoost(2.5)
453 finalQuery.AddQuery(tier3Query)
454
455 // 1000+ stars
456 tier4Query := bleve.NewBooleanQuery()
457 tier4Query.AddMust(indexerQuery)
458 min1000 := float64(1000)
459 starRange4 := bleveutil.NumericRangeQuery("star_count", &min1000, nil)
460 tier4Query.AddMust(starRange4)
461 tier4Query.SetBoost(4.0)
462 finalQuery.AddQuery(tier4Query)
463
464 // use minimum of 1 to ensure all results match at least one clause
465 finalQuery.SetMin(1)
466
467 searchReq := bleve.NewSearchRequestOptions(finalQuery, opts.Page.Limit, opts.Page.Offset, false)
468
469 if opts.SortField != "" && opts.SortField != "relevance" {
470 var sortField string
471
472 switch opts.SortField {
473 case "created":
474 sortField = "created"
475 case "stars":
476 sortField = "star_count"
477 case "issues":
478 sortField = "issue_count"
479 case "pulls":
480 sortField = "pull_count"
481 default:
482 // invalid field, fall back to relevance
483 sortField = ""
484 }
485
486 if sortField != "" {
487 searchReq.SortByCustom(search.SortOrder{
488 &search.SortField{
489 Field: sortField,
490 Desc: opts.SortDesc,
491 },
492 })
493 }
494 }
495
496 res, err := ix.indexer.SearchInContext(ctx, searchReq)
497 if err != nil {
498 return nil, err
499 }
500 ret := &SearchResult{
501 Total: res.Total,
502 Duration: res.Took,
503 Hits: make([]int64, len(res.Hits)),
504 }
505 for i, hit := range res.Hits {
506 id, err := base36.Decode(hit.ID)
507 if err != nil {
508 return nil, err
509 }
510 ret.Hits[i] = id
511 }
512 return ret, nil
513}