ElasticSearch-7.3.0 進階語法

elasticsearch官方文檔

字段類型

# Text：被分析索引的字符串類型
# Keyword：不能被分析只能被精確匹配的字符串類型
# Date：日期類型，可以配置 format 一起使用（{"type": "date", "format": "yyyy-MM-dd"}）
# 數字類型：long，integer，short，double 等
# boolean 類型：true，false
# Array：數組類型 ["one", "two"]
# Object：json 嵌套（{"property1": "value1", "property2": "value2"}）
# Ip類型：127.0.0.1
# Geo_point：地理位置
	地址的定義：
	{
		"mappings": {
			"_doc": {
				"properties": {
					"location": {
						"type": "geo_point"
					}
				}
			}
		}
	}
	建立索引的方式：
	"location": {
		"lat": 41.12,
		"lon": -71.34
	}

高級查詢語法

`analyze`分析過程

# 使用 analyze api 查看分詞狀態
GET /movie/_analyze
{
  "field": "name",
  "text": "Eating an apple a day & keeps the doctor awawy"
}

# 使用結構化的方式重新創建索引（指定分詞器）
PUT /movie
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 1
  },
  "mappings": {
    "properties": {
      "name": {
        "type": "text",
        "analyzer": "english"
      }
    }
  }
}

`Tmdb`實例

數據下載

在網上直接搜索 kaggle tmdb 即可下載相對應的數據文件

索引建立

# 建立 movie 索引
PUT /movie
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 1
  },
  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "analyzer": "english"
      },
      "tagline": {
        "type": "text",
        "analyzer": "english"
      },
      "release_date": {
        "type": "date",
        "format": "8yyyy/MM/dd||yyyy/M/dd||yyyy/MM/d||yyyy/M/d"
      },
      "popularity": {
        "type": "double"
      },
      "overview": {
        "type": "text",
        "analyzer": "english"
      },
      "cast": {
        "type": "object",
        "properties": {
          "character": {
            "type": "text",
            "analyzer": "standard"
          },
          "name": {
            "type": "text",
            "analyzer": "standard"
          }
        }
      }
    }
  }
}

`match`和`term`

# match 查詢
GET /movie/_search
{
  "query": {
    "match": {
      "title": "steve zissou"
    }
  }
}

# term 查詢
GET /movie/_search
{
  "query": {
    "term": {
      "title": {
        "value": "steve zissou"
      }
    }
  }
}

# match 查詢會根據字段所指定的分詞器對查詢字段進行分詞，而 term 並不會對查詢字段進行分詞，也就是說對於上面兩個示例，title 指定的是 english 分詞器，所以 match 查詢中的 steve zissou 會被分詞器解析成 steve 和 zissou 兩個關鍵詞，所以只要 title 中含有 steve 和 zissou 中任意一個關鍵詞的都可以被命中，而 term 查詢中的 steve zissou 不會被分詞器解析，也就是說只有 title 中經過分詞器解析後的詞包含 steve zissou 時纔會被命中。由於 english 分詞器會把 title 拆成一個個單獨的詞項，索引中並不存在 steve zissou 這樣的完整詞項，因此上面的 term 查詢實際上不會命中任何文檔。

分詞後的`and`和`or`

# 分詞後的 or 的邏輯
GET /movie/_search
{
  "query": {
    "match": {
      "title": "basketball with cartoom aliens"
    }
  }
}

# 分詞後的 and 的邏輯
GET /movie/_search
{
  "query": {
    "match": {
      "title": {
        "query": "basketball with cartoom aliens",
        "operator": "and"
      }
    }
  }
}

最小詞匹配項

# 最小詞匹配項
GET /movie/_search
{
  "query": {
    "match": {
      "title": {
        "query": "basketball love aliens",
        "operator": "or",
        "minimum_should_match": 2
      }
    }
  }
}

短語查詢

# 短語查詢
GET /movie/_search
{
  "query": {
    "match_phrase": {
      "title": "steve zissou"
    }
  }
}

`score`打分

# 查看 score
GET /movie/_search
{
  "explain": true,
  "query": {
    "match": {
      "title": "steve"
    }
  }
}
======================================================
"details" : [
 {
	# 2.2 * 7.1592917 * 0.47008154 = 7.403992
    "value" : 7.403992,
    "description" : "score(freq=1.0), product of:",
    "details" : [
      {
        "value" : 2.2,
        # 可以手動指定 boost 放大係數；如果不指定，默認 boost 爲 1，此處顯示的 2.2 = boost(1) * (k1 + 1)，其中 k1 = 1.2
        "description" : "boost",
        "details" : [ ]
      },
      {
        "value" : 7.1592917,
        # 逆文檔頻率：隨着 n 的增加，整個 idf 是減少的
        "description" : "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
        "details" : [
          {
            "value" : 3,
            # 一共命中了 3 篇文檔
            "description" : "n, number of documents containing term",
            "details" : [ ]
          },
          {
            "value" : 4500,
            # 文檔的總個數爲 4500
            "description" : "N, total number of documents with field",
            "details" : [ ]
          }
        ]
      },
      {
        "value" : 0.47008154,
        "description" : "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
        "details" : [
          {
            "value" : 1.0,
            # 搜索關鍵詞在文檔字段中出現的次數
            "description" : "freq, occurrences of term within document",
            "details" : [ ]
          },
          {
            "value" : 1.2,
            "description" : "k1, term saturation parameter",
            "details" : [ ]
          },
          {
            "value" : 0.75,
            "description" : "b, length normalization parameter",
            "details" : [ ]
          },
          {
            "value" : 2.0,
            # 文檔字段的長度
            "description" : "dl, length of field",
            "details" : [ ]
          },
          {
            "value" : 2.1757777,
            "description" : "avgdl, average length of field",
            "details" : [ ]
          }
        ]
      }
    ]
  }
]

多字段查詢

# 多字段查詢：在多字段查詢時，會對兩個字段都進行打分，最後的打分結果取的是最大的那個分值
GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title", "overview"]
    }
  }
}

# 優化多字段查詢：讓 title 字段佔比加大
GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title^10", "overview"]
    }
  }
}

# 優化多字段查詢
GET /movie/_search
{
  "explain": true, 
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title^10", "overview"],
      "tie_breaker": 0.3
    }
  }
}

# bool 查詢
# must：必須都爲 true
# must_not：必須都是 false
# should：其中只要有一個爲 true，即可
# 爲 true 的越多則得分越高
GET /movie/_search
{
  "explain": true, 
  "query": {
    "bool": {
      "should": [
        {
          "match": {
            "title": "basketball with cartoom aliens"
          }
        },
        {
          "match": {
            "overview": "basketball with cartoom aliens"
          }
        }
      ]
    }
  }
}

# 不同的 multi_match 其實是有不同的 type，type 不同則打分方式不同
# best_fields：默認的得分方式，取得最高的分數作爲對應文檔的得分，“最匹配模式” -> dis_max
GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title", "overview"],
      "type": "best_fields"
    }
  }
}

# dis_max
GET /movie/_search
{
  "explain": true, 
  "query": {
    "dis_max": {
      "queries": [
        {
          "match": {
            "title": "basketball with cartoom aliens"
          }
        },
        {
          "match": {
            "overview": "basketball with cartoom aliens"
          }
        }
      ]
    }
  }
}

# 查看打分規則：dis_max
GET /movie/_validate/query?explain
{
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title^10", "overview"],
      "type": "best_fields"
    }
  }
}

# most_fields：考慮所有匹配的字段，將文檔各個字段的得分相加作爲最終得分
GET /movie/_search
{
  "explain": true, 
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title", "overview"],
      "type": "most_fields"
    }
  }
}

# 權重的調整是針對 boost 進行調整
GET /movie/_validate/query?explain
{
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title^10", "overview^0.1"],
      "type": "most_fields"
    }
  }
}

# cross_fields：以分詞爲單位計算欄位的總分，適用於詞導向的匹配模式
GET /movie/_search
{
  "explain": true, 
  "query": {
    "multi_match": {
      "query": "steve jobs",
      "fields": ["title", "overview"],
      "type": "cross_fields"
    }
  }
}

GET /movie/_validate/query?explain
{
  "query": {
    "multi_match": {
      "query": "steve jobs",
      "fields": ["title", "overview"],
      "type": "cross_fields"
    }
  }
}

# query string
# 方便的利用 AND OR NOT
GET /movie/_search
{
  "query": {
    "query_string": {
      "fields": ["title"],
      "query": "steve AND jobs"
    }
  }
}

過濾與排序

# filter 過濾查詢
# 單條件過濾
GET /movie/_search
{
  "query": {
    "bool": {
      "filter": {
        "term": {
          "title": "steve"
        }
      }
    }
  }
}

# 多條件過濾
GET /movie/_search
{
  "query": {
    "bool": {
      "filter": [
        {
          "term": {
            "title": "steve"
          }
        },
        {
          "term": {
            "cast.name": "gaspard"
          }
        }
      ]
    }
  }
}

# 多條件過濾
GET /movie/_search
{
  "query": {
    "bool": {
      "filter": [
        {
          "term": {
            "title": "steve"
          }
        },
        {
          "term": {
            "cast.name": "gaspard"
          }
        },
        {
          "range": {
            "release_date": {
              "lte": "2015/01/01"
            }
          }
        },
        {
          "range": {
            "popularity": {
              "gte": 25
            }
          }
        }
      ]
    }
  }
}

# 多條件過濾並排序
GET /movie/_search
{
  "query": {
    "bool": {
      "filter": [
        {
          "term": {
            "title": "steve"
          }
        },
        {
          "term": {
            "cast.name": "gaspard"
          }
        },
        {
          "range": {
            "release_date": {
              "lte": "2015/01/01"
            }
          }
        },
        {
          "range": {
            "popularity": {
              "gte": 25
            }
          }
        }
      ]
    }
  },
  "sort": [
    {
      "popularity": {
        "order": "desc"
      }
    }
  ]
}

# 帶 match 打分的 filter，should 控制打分，filter 控制過濾
GET /movie/_search
{
  "query": {
    "bool": {
      "should": [
        {
          "match": {
            "title": "life"
          }
        }
      ],
      "filter": [
        {
          "term": {
            "title": "steve"
          }
        },
        {
          "term": {
            "cast.name": "gaspard"
          }
        },
        {
          "range": {
            "release_date": {
              "lte": "2015/01/01"
            }
          }
        },
        {
          "range": {
            "popularity": {
              "gte": 25
            }
          }
        }
      ]
    }
  }
}

查全率查準率

查全率：正確的結果有 n 個，查詢出來正確的有 m 個，所以查全率就是 m / n
查準率：查出的 k 個文檔中有 m 個文檔是正確的，所以查準率就是 m / k
兩者不可兼得，但是可以調整順序

通常可以優先追求高的查全率；查全率提高往往會導致查準率降低，但只要保證查詢結果中正確的 m 個內容排在前面，就既可以保證用戶體驗，又可以保證查全率。

自定義`score`

# function-score
GET /movie/_search
{
  "explain": true, 
  "query": {
    "function_score": {
      # 原始查詢得到的 oldScore
      "query": {
        "multi_match": {
          "query": "steve job",
          "fields": [
            "title",
            "overview"
          ],
          "operator": "or",
          "type": "most_fields"
        }
      },
      "functions": [
        {
          "field_value_factor": {
            # 對應要調整處理的字段
            "field": "popularity",
            "modifier": "log2p",
            "factor": 10
          }
        }
      ]
    }
  }
}

# function-score
GET /movie/_search
{
  "explain": true, 
  "query": {
    "function_score": {
      # 原始查詢得到的 oldScore
      "query": {
        "multi_match": {
          "query": "steve job",
          "fields": [
            "title",
            "overview"
          ],
          "operator": "or",
          "type": "most_fields"
        }
      },
      "functions": [
        {
          "field_value_factor": {
            # 對應要調整處理的字段
            "field": "popularity",
            "modifier": "log2p",
            "factor": 10
          }
        },
        {
          "field_value_factor": {
            "field": "popularity",
            "modifier": "log2p",
            "factor": 5
          }
        }
      ],
      # 不同的 field value 之間的得分相加
      "score_mode": "sum",
      # 最後再與原始查詢得到的 oldScore 相加
      "boost_mode": "sum"
    }
  }
}

ElasticSearch-7.3.0 進階語法

ElasticSearch-7.3.0 進階語法

字段類型

高級查詢語法

`analyze`分析過程

`Tmdb`實例

數據下載

索引建立

`match`和`term`

分詞後的`and`和`or`

最小詞匹配項

短語查詢

`score`打分

多字段查詢

過濾與排序

查全率查準率

自定義`score`

python gdal 安裝使用（Windows， python 3.6.8）

Spark常見的Transformation算子（三）

Hadoop註解InterfaceAudience InterfaceStability

MapReduce編程實例

Sqoop錯誤

MapReduce—平均工資

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結

ElasticSearch-7.3.0 進階語法

ElasticSearch-7.3.0 進階語法

字段類型

高級查詢語法

analyze分析過程

Tmdb實例

數據下載

索引建立

match和term

分詞後的and和or

最小詞匹配項

短語查詢

score打分

多字段查詢

過濾與排序

查全率查準率

自定義score

`analyze`分析過程

`Tmdb`實例

`match`和`term`

分詞後的`and`和`or`

`score`打分

自定義`score`