内容简介:搜索引擎最核心的地方在于倒排索引,而倒排索引其实并不是一种具体的数据结构,确切的来说是一类。 这个实现中使用Golang中的执行日志:TODO:
搜索引擎最核心的地方在于倒排索引,而倒排索引其实并不是一种具体的数据结构,确切的来说是一类。
这个实现中使用Golang中的 map
来做倒排索引,全部代码如下:
package main import ( "io/ioutil" "log" "os" ) // Index 倒排索引索引项 type Index struct { DocumentName string Offset int } // ToParsingFile 待索引的文件 type ToParsingFile struct { FileName string Content string } // ReverseIndex 单个倒排索引具体 type ReverseIndex map[string][]Index // Blacklist 倒排索引key的黑名单 type Blacklist map[string]bool // Ignorelist 忽略字符列表 type Ignorelist map[rune]bool func readFile(fileName string) (string, error) { contentBytes, err := ioutil.ReadFile(fileName) if err != nil { log.Printf("open file failed with error: %v\n", err) return "", err } return string(contentBytes), nil } func parse(ignorelist Ignorelist, fileName, content string) ReverseIndex { reverseFile := make(ReverseIndex) lastIndex := 0 for i, c := range(content) { if _, ok := ignorelist[c]; ok { newWord := string(content[lastIndex:i]) reverseFile[newWord] = append(reverseFile[newWord], Index{fileName, lastIndex}) lastIndex = i + 1 } } return reverseFile } // 合并多个倒排索引并且剔除blacklist中的key func mergeReverseIndex(blacklist Blacklist, args ...ReverseIndex) ReverseIndex { result := make(ReverseIndex) for _, arg := range(args) { for k, vs := range(arg) { // 忽略黑名单中的key if _, ok := blacklist[k]; ok { continue } // 忽略连续断字符组成的 if len(k) == 0 { continue } // TODO: write myself a `extend` function for Golang slice for _, v := range(vs) { result[k] = append(result[k], v) } } } return result } func getDocumentNames(reverseFile ReverseIndex, keyWord string) map[string]bool { resultMap := make(map[string]bool) indexList, ok := reverseFile[keyWord] if !ok { return resultMap } for _, index := range(indexList) { resultMap[index.DocumentName] = true } return resultMap } func makeSubSetOf(map1, map2 map[string]bool) map[string]bool { var longer, shorter map[string]bool resultMap := make(map[string]bool) if len(map1) > len(map2) { longer = map1 shorter = map2 } else { longer = map2 shorter = map1 } for k := range(shorter) { if _, ok := longer[k]; ok { resultMap[k] = true } } return resultMap } // 搜索,目前的检索方案为: // 检索出每个关键字所在的文章 // 对关键字做交集处理 // 取出所有关键字都存在的文章然后返回 func search(reverseFile ReverseIndex, keyWords ...string) []string { result := make([]string, 0) var finalDocumentNames map[string]bool for _, keyWord := range(keyWords) { documentNames := getDocumentNames(reverseFile, keyWord) if finalDocumentNames == nil { finalDocumentNames = documentNames } finalDocumentNames = makeSubSetOf(finalDocumentNames, documentNames) } for doc := range(finalDocumentNames) { result = append(result, doc) } return result } func main() { contents := make([]ToParsingFile, 0) files, err := ioutil.ReadDir("./docs") if err != nil { log.Panicf("read dir(./docs) failed, aborting...") } for _, file := range(files) { fileName := file.Name() content, err := readFile("./docs/" + fileName) if err != nil { log.Panicf("read file(%s) failed!", fileName) } contents = append(contents, ToParsingFile{fileName, content}) } blacklist := Blacklist{"a": false, "is": false} ignorelist := Ignorelist{ ' ': false, '\t': false, '\n': false, '\r': false, ',': false, '.': false, '"': false, ':': false, '`': false, '(': false, ')': false, '[': false, ']': false, '{': false, '}': false, '-': false, '_': false, '+': false, '=': false, '\\': false, '|': false, '\'': false, '<': false, '>': false, '$': false, } var reverseFiles []ReverseIndex for _, toParsingFile := range(contents) { reverseFiles = append(reverseFiles, parse(ignorelist, toParsingFile.FileName, toParsingFile.Content)) } reverseFile := mergeReverseIndex(blacklist, reverseFiles...) log.Printf("we're going to create reverse index") for k, vs := range reverseFile { log.Printf("Key: %s -> ", k) for _, v := range vs { log.Printf("in doc %s with offset %d, ", v.DocumentName, v.Offset) } } log.Printf("looping done") keyWords := os.Args[1:] searchResults := search(reverseFile, keyWords...) log.Printf("========================= Search Engine =================\n") log.Printf("searing %v...\n", keyWords) for _, searchResult := range(searchResults) { log.Printf("in file: %s\n", searchResult) } log.Printf("done.\n") }
执行日志:
$ go run reverse_file.go bigbang 2017/06/02 23:12:19 we're going to create reverse index 2017/06/02 23:12:19 Key: inherited -> 2017/06/02 23:12:19 in doc dev.rst with offset 297, 2017/06/02 23:12:19 Key: are -> 2017/06/02 23:12:19 in doc dev.rst with offset 16, 2017/06/02 23:12:19 Key: Installation -> 2017/06/02 23:12:19 in doc user.rst with offset 105, 2017/06/02 23:12:19 Key: about -> 2017/06/02 23:12:19 in doc user.rst with offset 78, 2017/06/02 23:12:19 Key: Using -> 2017/06/02 23:12:19 in doc user.rst with offset 266, 2017/06/02 23:12:19 Key: User -> 2017/06/02 23:12:19 in doc user.rst with offset 0, 2017/06/02 23:12:19 Key: of -> 2017/06/02 23:12:19 in doc dev.rst with offset 95, 2017/06/02 23:12:19 in doc user.rst with offset 36, 2017/06/02 23:12:19 in doc user.rst with offset 560, 2017/06/02 23:12:19 Key: method -> 2017/06/02 23:12:19 in doc dev.rst with offset 77, 2017/06/02 23:12:19 Key: by -> 2017/06/02 23:12:19 in doc user.rst with offset 337, 2017/06/02 23:12:19 Key: use -> 2017/06/02 23:12:19 in doc user.rst with offset 91, 2017/06/02 23:12:19 in doc user.rst with offset 309, 2017/06/02 23:12:19 Key: using -> 2017/06/02 23:12:19 in doc user.rst with offset 582, 2017/06/02 23:12:19 Key: set -> 2017/06/02 23:12:19 in doc user.rst with offset 468, 2017/06/02 23:12:19 in doc user.rst with offset 698, 2017/06/02 23:12:19 Key: for -> 2017/06/02 23:12:19 in doc dev.rst with offset 28, 2017/06/02 23:12:19 in doc dev.rst with offset 119, 2017/06/02 23:12:19 Key: loop -> 2017/06/02 23:12:19 in doc dev.rst with offset 231, 2017/06/02 23:12:19 in doc user.rst with offset 323, 2017/06/02 23:12:19 in doc user.rst with offset 381, 2017/06/02 23:12:19 in doc user.rst with offset 478, 2017/06/02 23:12:19 in doc user.rst with offset 567, 2017/06/02 23:12:19 in doc user.rst with offset 655, 2017/06/02 23:12:19 in doc user.rst with offset 679, 2017/06/02 23:12:19 in doc user.rst with offset 708, 2017/06/02 23:12:19 in doc user.rst with offset 713, 2017/06/02 23:12:19 Key: undoc -> 2017/06/02 23:12:19 in doc dev.rst with offset 279, 2017/06/02 23:12:19 Key: bigbang -> 2017/06/02 23:12:19 in doc dev.rst with offset 318, 2017/06/02 23:12:19 in doc user.rst with offset 720, 2017/06/02 23:12:19 Key: requires -> 2017/06/02 23:12:19 in doc user.rst with offset 168, 2017/06/02 23:12:19 Key: section -> 2017/06/02 23:12:19 in doc user.rst with offset 28, 2017/06/02 23:12:19 Key: templates -> 2017/06/02 23:12:19 in doc .gitignore with offset 16, 2017/06/02 23:12:19 Key: documentation -> 2017/06/02 23:12:19 in doc dev.rst with offset 102, 2017/06/02 23:12:19 in doc user.rst with offset 43, 2017/06/02 23:12:19 Key: or -> 2017/06/02 23:12:19 in doc dev.rst with offset 74, 2017/06/02 23:12:19 Key: event -> 2017/06/02 23:12:19 in doc dev.rst with offset 225, 2017/06/02 23:12:19 in doc user.rst with offset 317, 2017/06/02 23:12:19 in doc user.rst with offset 375, 2017/06/02 23:12:19 in doc user.rst with offset 472, 2017/06/02 23:12:19 in doc user.rst with offset 673, 2017/06/02 23:12:19 in doc user.rst with offset 702, 2017/06/02 23:12:19 Key: build -> 2017/06/02 23:12:19 in doc .gitignore with offset 1, 2017/06/02 23:12:19 Key: this -> 2017/06/02 23:12:19 in doc dev.rst with offset 85, 2017/06/02 23:12:19 Key: members -> 2017/06/02 23:12:19 in doc dev.rst with offset 186, 2017/06/02 23:12:19 in doc dev.rst with offset 267, 2017/06/02 23:12:19 in doc dev.rst with offset 285, 2017/06/02 23:12:19 in doc dev.rst with offset 307, 2017/06/02 23:12:19 Key: autofunction -> 2017/06/02 23:12:19 in doc dev.rst with offset 199, 2017/06/02 23:12:19 Key: Alternatively -> 2017/06/02 23:12:19 in doc user.rst with offset 518, 2017/06/02 23:12:19 Key: Use -> 2017/06/02 23:12:19 in doc user.rst with offset 190, 2017/06/02 23:12:19 Key: instance -> 2017/06/02 23:12:19 in doc user.rst with offset 551, 2017/06/02 23:12:19 Key: API -> 2017/06/02 23:12:19 in doc dev.rst with offset 0, 2017/06/02 23:12:19 Key: the -> 2017/06/02 23:12:19 in doc dev.rst with offset 98, 2017/06/02 23:12:19 in doc user.rst with offset 39, 2017/06/02 23:12:19 in doc user.rst with offset 313, 2017/06/02 23:12:19 in doc user.rst with offset 362, 2017/06/02 23:12:19 in doc user.rst with offset 563, 2017/06/02 23:12:19 Key: console -> 2017/06/02 23:12:19 in doc user.rst with offset 230, 2017/06/02 23:12:19 Key: autoclass -> 2017/06/02 23:12:19 in doc dev.rst with offset 148, 2017/06/02 23:12:19 in doc dev.rst with offset 240, 2017/06/02 23:12:19 Key: looking -> 2017/06/02 23:12:19 in doc dev.rst with offset 20, 2017/06/02 23:12:19 Key: provided -> 2017/06/02 23:12:19 in doc user.rst with offset 328, 2017/06/02 23:12:19 Key: If -> 2017/06/02 23:12:19 in doc dev.rst with offset 9, 2017/06/02 23:12:19 Key: 3 -> 2017/06/02 23:12:19 in doc user.rst with offset 184, 2017/06/02 23:12:19 Key: create -> 2017/06/02 23:12:19 in doc user.rst with offset 541, 2017/06/02 23:12:19 Key: manually -> 2017/06/02 23:12:19 in doc user.rst with offset 572, 2017/06/02 23:12:19 Key: an -> 2017/06/02 23:12:19 in doc user.rst with offset 548, 2017/06/02 23:12:19 Key: can -> 2017/06/02 23:12:19 in doc user.rst with offset 537, 2017/06/02 23:12:19 Key: function -> 2017/06/02 23:12:19 in doc dev.rst with offset 58, 2017/06/02 23:12:19 Key: new -> 2017/06/02 23:12:19 in doc dev.rst with offset 221, 2017/06/02 23:12:19 in doc user.rst with offset 669, 2017/06/02 23:12:19 Key: policy -> 2017/06/02 23:12:19 in doc user.rst with offset 386, 2017/06/02 23:12:19 in doc user.rst with offset 483, 2017/06/02 23:12:19 Key: It -> 2017/06/02 23:12:19 in doc user.rst with offset 165, 2017/06/02 23:12:19 Key: from -> 2017/06/02 23:12:19 in doc user.rst with offset 154, 2017/06/02 23:12:19 Key: available -> 2017/06/02 23:12:19 in doc user.rst with offset 144, 2017/06/02 23:12:19 Key: Guide -> 2017/06/02 23:12:19 in doc user.rst with offset 5, 2017/06/02 23:12:19 Key: uvloop -> 2017/06/02 23:12:19 in doc dev.rst with offset 130, 2017/06/02 23:12:19 in doc dev.rst with offset 160, 2017/06/02 23:12:19 in doc dev.rst with offset 214, 2017/06/02 23:12:19 in doc dev.rst with offset 252, 2017/06/02 23:12:19 in doc user.rst with offset 95, 2017/06/02 23:12:19 in doc user.rst with offset 133, 2017/06/02 23:12:19 in doc user.rst with offset 257, 2017/06/02 23:12:19 in doc user.rst with offset 272, 2017/06/02 23:12:19 in doc user.rst with offset 341, 2017/06/02 23:12:19 in doc user.rst with offset 367, 2017/06/02 23:12:19 in doc user.rst with offset 449, 2017/06/02 23:12:19 in doc user.rst with offset 490, 2017/06/02 23:12:19 in doc user.rst with offset 644, 2017/06/02 23:12:19 in doc user.rst with offset 662, 2017/06/02 23:12:19 Key: python -> 2017/06/02 23:12:19 in doc user.rst with offset 411, 2017/06/02 23:12:19 in doc user.rst with offset 606, 2017/06/02 23:12:19 Key: class -> 2017/06/02 23:12:19 in doc dev.rst with offset 68, 2017/06/02 23:12:19 Key: how -> 2017/06/02 23:12:19 in doc user.rst with offset 84, 2017/06/02 23:12:19 Key: it -> 2017/06/02 23:12:19 in doc user.rst with offset 209, 2017/06/02 23:12:19 Key: provides -> 2017/06/02 23:12:19 in doc user.rst with offset 57, 2017/06/02 23:12:19 Key: To -> 2017/06/02 23:12:19 in doc user.rst with offset 293, 2017/06/02 23:12:19 Key: 5 -> 2017/06/02 23:12:19 in doc user.rst with offset 186, 2017/06/02 23:12:19 Key: part -> 2017/06/02 23:12:19 in doc dev.rst with offset 90, 2017/06/02 23:12:19 Key: pip -> 2017/06/02 23:12:19 in doc user.rst with offset 194, 2017/06/02 23:12:19 in doc user.rst with offset 245, 2017/06/02 23:12:19 Key: asyncio -> 2017/06/02 23:12:19 in doc user.rst with offset 301, 2017/06/02 23:12:19 in doc user.rst with offset 430, 2017/06/02 23:12:19 in doc user.rst with offset 460, 2017/06/02 23:12:19 in doc user.rst with offset 625, 2017/06/02 23:12:19 in doc user.rst with offset 690, 2017/06/02 23:12:19 Key: PyPI -> 2017/06/02 23:12:19 in doc user.rst with offset 159, 2017/06/02 23:12:19 Key: static -> 2017/06/02 23:12:19 in doc .gitignore with offset 8, 2017/06/02 23:12:19 Key: specific -> 2017/06/02 23:12:19 in doc dev.rst with offset 49, 2017/06/02 23:12:19 Key: make -> 2017/06/02 23:12:19 in doc user.rst with offset 296, 2017/06/02 23:12:19 Key: Python -> 2017/06/02 23:12:19 in doc user.rst with offset 177, 2017/06/02 23:12:19 Key: code -> 2017/06/02 23:12:19 in doc user.rst with offset 217, 2017/06/02 23:12:19 in doc user.rst with offset 398, 2017/06/02 23:12:19 in doc user.rst with offset 593, 2017/06/02 23:12:19 Key: block -> 2017/06/02 23:12:19 in doc user.rst with offset 222, 2017/06/02 23:12:19 in doc user.rst with offset 403, 2017/06/02 23:12:19 in doc user.rst with offset 598, 2017/06/02 23:12:19 Key: you -> 2017/06/02 23:12:19 in doc dev.rst with offset 12, 2017/06/02 23:12:19 in doc dev.rst with offset 123, 2017/06/02 23:12:19 in doc user.rst with offset 350, 2017/06/02 23:12:19 in doc user.rst with offset 533, 2017/06/02 23:12:19 Key: EventLoopPolicy -> 2017/06/02 23:12:19 in doc dev.rst with offset 167, 2017/06/02 23:12:19 in doc user.rst with offset 497, 2017/06/02 23:12:19 Key: to -> 2017/06/02 23:12:19 in doc user.rst with offset 88, 2017/06/02 23:12:19 in doc user.rst with offset 198, 2017/06/02 23:12:19 Key: This -> 2017/06/02 23:12:19 in doc user.rst with offset 23, 2017/06/02 23:12:19 Key: on -> 2017/06/02 23:12:19 in doc dev.rst with offset 44, 2017/06/02 23:12:19 Key: import -> 2017/06/02 23:12:19 in doc user.rst with offset 423, 2017/06/02 23:12:19 in doc user.rst with offset 442, 2017/06/02 23:12:19 in doc user.rst with offset 618, 2017/06/02 23:12:19 in doc user.rst with offset 637, 2017/06/02 23:12:19 Key: install -> 2017/06/02 23:12:19 in doc user.rst with offset 201, 2017/06/02 23:12:19 in doc user.rst with offset 249, 2017/06/02 23:12:19 in doc user.rst with offset 354, 2017/06/02 23:12:19 Key: Loop -> 2017/06/02 23:12:19 in doc dev.rst with offset 259, 2017/06/02 23:12:19 Key: information -> 2017/06/02 23:12:19 in doc dev.rst with offset 32, 2017/06/02 23:12:19 in doc user.rst with offset 66, 2017/06/02 23:12:19 looping done 2017/06/02 23:12:19 ========================= Search Engine ================= 2017/06/02 23:12:19 searing [bigbang]... 2017/06/02 23:12:19 in file: dev.rst 2017/06/02 23:12:19 in file: user.rst 2017/06/02 23:12:19 done.
TODO:
- 使用B树替代哈希表做倒排索引
- 使用文件持久化倒排索引
以上所述就是小编给大家介绍的《自己写个搜索引擎》,希望对大家有所帮助,如果大家有任何疑问请给我留言,小编会及时回复大家的。在此也非常感谢大家对 码农网 的支持!
猜你喜欢:本站部分资源来源于网络,本站转载出于传递更多信息之目的,版权归原作者或者来源机构所有,如转载稿涉及版权问题,请联系我们。
Designing Data-Intensive Applications
Martin Kleppmann / O'Reilly Media / 2017-4-2 / USD 44.99
Data is at the center of many challenges in system design today. Difficult issues need to be figured out, such as scalability, consistency, reliability, efficiency, and maintainability. In addition, w......一起来看看 《Designing Data-Intensive Applications》 这本书的介绍吧!