Monorepo for Tangled tangled.org
2

Configure Feed

Select the types of activity you want to include in your feed.

1// heavily inspired by gitea's model (basically copy-pasted) 2package repos_indexer 3 4import ( 5 "context" 6 "errors" 7 "log" 8 "os" 9 "time" 10 11 "github.com/blevesearch/bleve/v2" 12 "github.com/blevesearch/bleve/v2/analysis/analyzer/custom" 13 "github.com/blevesearch/bleve/v2/analysis/token/camelcase" 14 "github.com/blevesearch/bleve/v2/analysis/token/lowercase" 15 "github.com/blevesearch/bleve/v2/analysis/token/ngram" 16 "github.com/blevesearch/bleve/v2/analysis/token/unicodenorm" 17 "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode" 18 "github.com/blevesearch/bleve/v2/index/upsidedown" 19 "github.com/blevesearch/bleve/v2/mapping" 20 "github.com/blevesearch/bleve/v2/search" 21 "github.com/blevesearch/bleve/v2/search/query" 22 "tangled.org/core/appview/db" 23 "tangled.org/core/appview/indexer/base36" 24 bleveutil "tangled.org/core/appview/indexer/bleve" 25 "tangled.org/core/appview/models" 26 "tangled.org/core/appview/pagination" 27 tlog "tangled.org/core/log" 28) 29 30const ( 31 repoIndexerAnalyzer = "repoIndexer" 32 repoIndexerDocType = "repoIndexerDocType" 33 34 unicodeNormalizeName = "unicodeNormalize" 35 36 // Bump this when the index mapping changes to trigger a rebuild. 37 repoIndexerVersion = 7 38) 39 40type Indexer struct { 41 indexer bleve.Index 42 path string 43} 44 45func NewIndexer(indexDir string) *Indexer { 46 return &Indexer{ 47 path: indexDir, 48 } 49} 50 51// Init initializes the indexer 52func (ix *Indexer) Init(ctx context.Context, e db.Execer) { 53 l := tlog.FromContext(ctx) 54 existed, err := ix.initialize(ctx) 55 if err != nil { 56 log.Fatalln("failed to initialize repo indexer", err) 57 } 58 if !existed { 59 l.Debug("Populating the repo indexer") 60 err := PopulateIndexer(ctx, ix, e) 61 if err != nil { 62 log.Fatalln("failed to populate repo indexer", err) 63 } 64 } 65 66 count, _ := ix.indexer.DocCount() 67 l.Info("Initialized the repo indexer", "docCount", count) 68} 69 70func generateRepoIndexMapping() (mapping.IndexMapping, error) { 71 mapping := bleve.NewIndexMapping() 72 docMapping := bleve.NewDocumentMapping() 73 74 textFieldMapping := bleve.NewTextFieldMapping() 75 textFieldMapping.Store = false 76 textFieldMapping.IncludeInAll = false 77 78 keywordFieldMapping := bleve.NewKeywordFieldMapping() 79 keywordFieldMapping.Store = false 80 keywordFieldMapping.IncludeInAll = false 81 82 // case-insensitive keyword field for language and topics 83 caseInsensitiveKeywordMapping := bleve.NewTextFieldMapping() 84 caseInsensitiveKeywordMapping.Store = false 85 caseInsensitiveKeywordMapping.IncludeInAll = false 86 caseInsensitiveKeywordMapping.Analyzer = "keyword_lowercase" 87 88 // trigram field for partial repo name matching 89 trigramFieldMapping := bleve.NewTextFieldMapping() 90 trigramFieldMapping.Store = false 91 trigramFieldMapping.IncludeInAll = false 92 trigramFieldMapping.Analyzer = "trigram" 93 94 // numeric field mapping for sorting by counts 95 numericFieldMapping := bleve.NewNumericFieldMapping() 96 numericFieldMapping.Store = false 97 numericFieldMapping.IncludeInAll = false 98 numericFieldMapping.DocValues = true // required for sorting 99 100 // datetime field mapping for sorting by creation date 101 dateFieldMapping := bleve.NewDateTimeFieldMapping() 102 dateFieldMapping.Store = false 103 dateFieldMapping.IncludeInAll = false 104 dateFieldMapping.DocValues = true // required for sorting 105 106 // boolean field mapping for fork detection 107 booleanFieldMapping := bleve.NewBooleanFieldMapping() 108 booleanFieldMapping.Store = false 109 booleanFieldMapping.IncludeInAll = false 110 111 // text fields 112 docMapping.AddFieldMappingsAt("name", textFieldMapping) 113 docMapping.AddFieldMappingsAt("name_trigram", trigramFieldMapping) 114 docMapping.AddFieldMappingsAt("description", textFieldMapping) 115 docMapping.AddFieldMappingsAt("website", textFieldMapping) 116 docMapping.AddFieldMappingsAt("topics", textFieldMapping) 117 118 // keyword fields 119 docMapping.AddFieldMappingsAt("language", caseInsensitiveKeywordMapping) 120 docMapping.AddFieldMappingsAt("topics_exact", caseInsensitiveKeywordMapping) 121 docMapping.AddFieldMappingsAt("did", keywordFieldMapping) 122 docMapping.AddFieldMappingsAt("knot", keywordFieldMapping) 123 docMapping.AddFieldMappingsAt("repo_did", keywordFieldMapping) 124 125 // fork indicator for down-ranking 126 docMapping.AddFieldMappingsAt("is_fork", booleanFieldMapping) 127 128 // sortable numeric fields 129 docMapping.AddFieldMappingsAt("star_count", numericFieldMapping) 130 docMapping.AddFieldMappingsAt("issue_count", numericFieldMapping) 131 docMapping.AddFieldMappingsAt("pull_count", numericFieldMapping) 132 133 // sortable date field 134 docMapping.AddFieldMappingsAt("created", dateFieldMapping) 135 136 err := mapping.AddCustomTokenFilter(unicodeNormalizeName, map[string]any{ 137 "type": unicodenorm.Name, 138 "form": unicodenorm.NFC, 139 }) 140 if err != nil { 141 return nil, err 142 } 143 144 err = mapping.AddCustomTokenFilter("edgeNgram3", map[string]any{ 145 "type": ngram.Name, 146 "min": 2.0, 147 "max": 3.0, 148 }) 149 if err != nil { 150 return nil, err 151 } 152 153 err = mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{ 154 "type": custom.Name, 155 "char_filters": []string{}, 156 "tokenizer": unicode.Name, 157 "token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name}, 158 }) 159 if err != nil { 160 return nil, err 161 } 162 163 err = mapping.AddCustomAnalyzer("keyword_lowercase", map[string]any{ 164 "type": custom.Name, 165 "char_filters": []string{}, 166 "tokenizer": "single", 167 "token_filters": []string{lowercase.Name}, 168 }) 169 if err != nil { 170 return nil, err 171 } 172 173 err = mapping.AddCustomAnalyzer("trigram", map[string]any{ 174 "type": custom.Name, 175 "char_filters": []string{}, 176 "tokenizer": "single", 177 "token_filters": []string{lowercase.Name, "edgeNgram3"}, 178 }) 179 if err != nil { 180 return nil, err 181 } 182 183 mapping.DefaultAnalyzer = repoIndexerAnalyzer 184 mapping.AddDocumentMapping(repoIndexerDocType, docMapping) 185 mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping()) 186 mapping.DefaultMapping = bleve.NewDocumentDisabledMapping() 187 188 return mapping, nil 189} 190 191func (ix *Indexer) initialize(ctx context.Context) (bool, error) { 192 if ix.indexer != nil { 193 return false, errors.New("indexer is already initialized") 194 } 195 196 indexer, err := openIndexer(ctx, ix.path, repoIndexerVersion) 197 if err != nil { 198 return false, err 199 } 200 if indexer != nil { 201 ix.indexer = indexer 202 return true, nil 203 } 204 205 mapping, err := generateRepoIndexMapping() 206 if err != nil { 207 return false, err 208 } 209 indexer, err = bleve.New(ix.path, mapping) 210 if err != nil { 211 return false, err 212 } 213 indexer.SetInternal([]byte("mapping_version"), []byte{byte(repoIndexerVersion)}) 214 215 ix.indexer = indexer 216 217 return false, nil 218} 219 220func openIndexer(ctx context.Context, path string, version int) (bleve.Index, error) { 221 l := tlog.FromContext(ctx) 222 indexer, err := bleve.Open(path) 223 if err != nil { 224 if errors.Is(err, upsidedown.IncompatibleVersion) { 225 l.Info("Indexer was built with a previous version of bleve, deleting and rebuilding") 226 return nil, os.RemoveAll(path) 227 } 228 return nil, nil 229 } 230 231 storedVersion, _ := indexer.GetInternal([]byte("mapping_version")) 232 if storedVersion == nil || int(storedVersion[0]) != version { 233 l.Info("Indexer mapping version changed, deleting and rebuilding") 234 indexer.Close() 235 return nil, os.RemoveAll(path) 236 } 237 238 return indexer, nil 239} 240 241func PopulateIndexer(ctx context.Context, ix *Indexer, e db.Execer) error { 242 l := tlog.FromContext(ctx) 243 count := 0 244 245 err := pagination.IterateAll( 246 func(page pagination.Page) ([]models.Repo, error) { 247 return db.GetReposPaginated(e, page) 248 }, 249 func(repos []models.Repo) error { 250 count += len(repos) 251 return ix.Index(ctx, repos...) 252 }, 253 ) 254 255 l.Info("repos indexed", "count", count) 256 return err 257} 258 259type repoData struct { 260 ID int64 `json:"id"` 261 RepoDid string `json:"repo_did"` 262 Did string `json:"did"` 263 Name string `json:"name"` 264 NameTrigram string `json:"name_trigram"` 265 Description string `json:"description"` 266 Website string `json:"website"` 267 Topics []string `json:"topics"` 268 TopicsExact []string `json:"topics_exact"` 269 Knot string `json:"knot"` 270 Language string `json:"language"` 271 IsFork bool `json:"is_fork"` 272 273 // sortable fields 274 StarCount int `json:"star_count"` 275 IssueCount int `json:"issue_count"` 276 PullCount int `json:"pull_count"` 277 Created time.Time `json:"created"` 278} 279 280func makeRepoData(repo *models.Repo) *repoData { 281 var language string 282 var starCount, issueCount, pullCount int 283 284 if repo.RepoStats != nil { 285 language = repo.RepoStats.Language 286 starCount = repo.RepoStats.StarCount 287 issueCount = repo.RepoStats.IssueCount.Open + repo.RepoStats.IssueCount.Closed 288 pullCount = repo.RepoStats.PullCount.Open + 289 repo.RepoStats.PullCount.Merged + 290 repo.RepoStats.PullCount.Closed 291 } 292 293 isFork := repo.Source != "" 294 295 return &repoData{ 296 ID: repo.Id, 297 RepoDid: repo.RepoDid, 298 Did: repo.Did, 299 Name: repo.Name, 300 NameTrigram: repo.Name, 301 Description: repo.Description, 302 Website: repo.Website, 303 Topics: repo.Topics, 304 TopicsExact: repo.Topics, 305 Knot: repo.Knot, 306 Language: language, 307 IsFork: isFork, 308 StarCount: starCount, 309 IssueCount: issueCount, 310 PullCount: pullCount, 311 Created: repo.Created, 312 } 313} 314 315// Type returns the document type, for bleve's mapping.Classifier interface. 316func (r *repoData) Type() string { 317 return repoIndexerDocType 318} 319 320type SearchResult struct { 321 Hits []int64 322 Total uint64 323 Duration time.Duration 324} 325 326const maxBatchSize = 20 327 328func (ix *Indexer) Index(ctx context.Context, repos ...models.Repo) error { 329 batch := bleveutil.NewFlushingBatch(ix.indexer, maxBatchSize) 330 for _, repo := range repos { 331 repoData := makeRepoData(&repo) 332 if err := batch.Index(base36.Encode(repo.Id), repoData); err != nil { 333 return err 334 } 335 } 336 return batch.Flush() 337} 338 339func (ix *Indexer) Delete(ctx context.Context, repoID int64) error { 340 return ix.indexer.Delete(base36.Encode(repoID)) 341} 342 343func (ix *Indexer) TotalDocCount() (uint64, error) { 344 return ix.indexer.DocCount() 345} 346 347func (ix *Indexer) Search(ctx context.Context, opts models.RepoSearchOptions) (*SearchResult, error) { 348 var musts []query.Query 349 var mustNots []query.Query 350 351 for _, keyword := range opts.Keywords { 352 musts = append(musts, bleve.NewDisjunctionQuery( 353 bleveutil.MatchAndQuery("name", keyword, repoIndexerAnalyzer, 0), 354 bleveutil.MatchAndQuery("name_trigram", keyword, "trigram", 0), 355 bleveutil.MatchAndQuery("description", keyword, repoIndexerAnalyzer, 0), 356 bleveutil.MatchAndQuery("website", keyword, repoIndexerAnalyzer, 0), 357 bleveutil.MatchAndQuery("topics", keyword, repoIndexerAnalyzer, 0), 358 )) 359 } 360 361 for _, phrase := range opts.Phrases { 362 musts = append(musts, bleve.NewDisjunctionQuery( 363 bleveutil.MatchPhraseQuery("name", phrase, repoIndexerAnalyzer), 364 bleveutil.MatchPhraseQuery("description", phrase, repoIndexerAnalyzer), 365 bleveutil.MatchPhraseQuery("website", phrase, repoIndexerAnalyzer), 366 bleveutil.MatchPhraseQuery("topics", phrase, repoIndexerAnalyzer), 367 )) 368 } 369 370 for _, keyword := range opts.NegatedKeywords { 371 mustNots = append(mustNots, bleve.NewDisjunctionQuery( 372 bleveutil.MatchAndQuery("name", keyword, repoIndexerAnalyzer, 0), 373 bleveutil.MatchAndQuery("description", keyword, repoIndexerAnalyzer, 0), 374 bleveutil.MatchAndQuery("website", keyword, repoIndexerAnalyzer, 0), 375 bleveutil.MatchAndQuery("topics", keyword, repoIndexerAnalyzer, 0), 376 )) 377 } 378 379 for _, phrase := range opts.NegatedPhrases { 380 mustNots = append(mustNots, bleve.NewDisjunctionQuery( 381 bleveutil.MatchPhraseQuery("name", phrase, repoIndexerAnalyzer), 382 bleveutil.MatchPhraseQuery("description", phrase, repoIndexerAnalyzer), 383 bleveutil.MatchPhraseQuery("website", phrase, repoIndexerAnalyzer), 384 bleveutil.MatchPhraseQuery("topics", phrase, repoIndexerAnalyzer), 385 )) 386 } 387 388 // keyword filters 389 if opts.Language != "" { 390 musts = append(musts, bleveutil.MatchAndQuery("language", opts.Language, "keyword_lowercase", 0)) 391 } 392 393 if opts.Knot != "" { 394 musts = append(musts, bleveutil.KeywordFieldQuery("knot", opts.Knot)) 395 } 396 397 if opts.Did != "" { 398 musts = append(musts, bleveutil.KeywordFieldQuery("did", opts.Did)) 399 } 400 401 for _, topic := range opts.Topics { 402 musts = append(musts, bleveutil.MatchAndQuery("topics_exact", topic, "keyword_lowercase", 0)) 403 } 404 405 for _, topic := range opts.NegatedTopics { 406 mustNots = append(mustNots, bleveutil.MatchAndQuery("topics_exact", topic, "keyword_lowercase", 0)) 407 } 408 409 indexerQuery := bleve.NewBooleanQuery() 410 if len(musts) == 0 { 411 musts = append(musts, bleve.NewMatchAllQuery()) 412 } 413 indexerQuery.AddMust(musts...) 414 indexerQuery.AddMustNot(mustNots...) 415 416 // use a disjunction where: 417 // - repos with more stars get higher boost 418 // - non-forks get a boost 419 // - boosts stack 420 finalQuery := bleve.NewDisjunctionQuery() 421 422 // add the main query 423 finalQuery.AddQuery(indexerQuery) 424 425 // add a boosted query for non-forks 426 notForkQuery := bleve.NewBooleanQuery() 427 notForkQuery.AddMust(indexerQuery) 428 isForkQuery := bleve.NewBoolFieldQuery(true) 429 isForkQuery.SetField("is_fork") 430 notForkQuery.AddMustNot(isForkQuery) 431 notForkQuery.SetBoost(2.0) 432 finalQuery.AddQuery(notForkQuery) 433 434 // add boosted queries for repos with more stars 435 // 10-99 stars 436 tier2Query := bleve.NewBooleanQuery() 437 tier2Query.AddMust(indexerQuery) 438 min10 := float64(10) 439 max99 := float64(99) 440 starRange2 := bleveutil.NumericRangeQuery("star_count", &min10, &max99) 441 tier2Query.AddMust(starRange2) 442 tier2Query.SetBoost(1.5) 443 finalQuery.AddQuery(tier2Query) 444 445 // 100-999 stars 446 tier3Query := bleve.NewBooleanQuery() 447 tier3Query.AddMust(indexerQuery) 448 min100 := float64(100) 449 max999 := float64(999) 450 starRange3 := bleveutil.NumericRangeQuery("star_count", &min100, &max999) 451 tier3Query.AddMust(starRange3) 452 tier3Query.SetBoost(2.5) 453 finalQuery.AddQuery(tier3Query) 454 455 // 1000+ stars 456 tier4Query := bleve.NewBooleanQuery() 457 tier4Query.AddMust(indexerQuery) 458 min1000 := float64(1000) 459 starRange4 := bleveutil.NumericRangeQuery("star_count", &min1000, nil) 460 tier4Query.AddMust(starRange4) 461 tier4Query.SetBoost(4.0) 462 finalQuery.AddQuery(tier4Query) 463 464 // use minimum of 1 to ensure all results match at least one clause 465 finalQuery.SetMin(1) 466 467 searchReq := bleve.NewSearchRequestOptions(finalQuery, opts.Page.Limit, opts.Page.Offset, false) 468 469 if opts.SortField != "" && opts.SortField != "relevance" { 470 var sortField string 471 472 switch opts.SortField { 473 case "created": 474 sortField = "created" 475 case "stars": 476 sortField = "star_count" 477 case "issues": 478 sortField = "issue_count" 479 case "pulls": 480 sortField = "pull_count" 481 default: 482 // invalid field, fall back to relevance 483 sortField = "" 484 } 485 486 if sortField != "" { 487 searchReq.SortByCustom(search.SortOrder{ 488 &search.SortField{ 489 Field: sortField, 490 Desc: opts.SortDesc, 491 }, 492 }) 493 } 494 } 495 496 res, err := ix.indexer.SearchInContext(ctx, searchReq) 497 if err != nil { 498 return nil, err 499 } 500 ret := &SearchResult{ 501 Total: res.Total, 502 Duration: res.Took, 503 Hits: make([]int64, len(res.Hits)), 504 } 505 for i, hit := range res.Hits { 506 id, err := base36.Decode(hit.ID) 507 if err != nil { 508 return nil, err 509 } 510 ret.Hits[i] = id 511 } 512 return ret, nil 513}