初始化,构建elasticsearch-hanlp-dockerfile
This commit is contained in:
commit
3567103a3e
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
/src/zqyy/elasticsearch/*.tar
|
||||
62
README.md
Normal file
62
README.md
Normal file
@ -0,0 +1,62 @@
|
||||
# zqyy-dockerfile
|
||||
|
||||
This repository keeps Dockerfiles under each image directory and uses one root
|
||||
`docker-compose.yml` as the unified build/run entrypoint.
|
||||
|
||||
## Layout
|
||||
|
||||
```text
|
||||
docker-compose.yml
|
||||
src/
|
||||
zqyy/
|
||||
elasticsearch/
|
||||
Dockerfile
|
||||
config/
|
||||
dictionaries/
|
||||
README.md
|
||||
test-hanlp.sh
|
||||
```
|
||||
|
||||
## Build Images
|
||||
|
||||
Build one image:
|
||||
|
||||
```bash
|
||||
docker compose build elasticsearch-hanlp
|
||||
```
|
||||
|
||||
Build all images defined in the root compose file:
|
||||
|
||||
```bash
|
||||
docker compose build
|
||||
```
|
||||
|
||||
## Run Services
|
||||
|
||||
Run one service:
|
||||
|
||||
```bash
|
||||
docker compose up -d elasticsearch-hanlp
|
||||
```
|
||||
|
||||
Run all services:
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
## Add A New Image
|
||||
|
||||
1. Create a new image directory under `src/zqyy/<image-name>/`.
|
||||
2. Put that image's `Dockerfile` and related files in the image directory.
|
||||
3. Add a service in the root `docker-compose.yml`.
|
||||
4. Set `build.context` to the image directory, for example:
|
||||
|
||||
```yaml
|
||||
services:
|
||||
example:
|
||||
image: zqyy/example:latest
|
||||
build:
|
||||
context: ./src/zqyy/example
|
||||
dockerfile: Dockerfile
|
||||
```
|
||||
39
docker-compose.yml
Normal file
39
docker-compose.yml
Normal file
@ -0,0 +1,39 @@
|
||||
# Root Compose file: builds and runs the single-node Elasticsearch + HanLP image.
services:
  elasticsearch-hanlp:
    image: zqyy/elasticsearch-hanlp:8.7.0
    build:
      context: ./src/zqyy/elasticsearch
      dockerfile: Dockerfile
      args:
        # Must match the Elasticsearch version declared in the HanLP plugin's pom.xml;
        # the Dockerfile aborts the build when they differ.
        ELASTIC_VERSION: "8.7.0"
    container_name: zqyy-elasticsearch-hanlp
    environment:
      discovery.type: single-node
      xpack.security.enabled: "false"
      xpack.security.enrollment.enabled: "false"
      ES_JAVA_OPTS: "-Xms1g -Xmx1g"
    ports:
      - "9200:9200"
      - "9300:9300"
    volumes:
      - elasticsearch-hanlp-data:/usr/share/elasticsearch/data
      # Optional: mount local dictionary/config files during development.
      # Rebuild the image for production so every node uses the same files.
      # - ./src/zqyy/elasticsearch/dictionaries/custom/zqyy-custom.txt:/usr/share/elasticsearch/plugins/analysis-hanlp/data/dictionary/custom/zqyy-custom.txt:ro
      # - ./src/zqyy/elasticsearch/config/hanlp.properties:/usr/share/elasticsearch/config/analysis-hanlp/hanlp.properties:ro
      # - ./src/zqyy/elasticsearch/config/hanlp-remote.xml:/usr/share/elasticsearch/config/analysis-hanlp/hanlp-remote.xml:ro
    ulimits:
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 65535
        hard: 65535
    healthcheck:
      # The URL is single-quoted on purpose. CMD-SHELL runs the string through
      # /bin/sh, where an unquoted "&" would put curl in the background and leave
      # "timeout=1s" as a variable assignment that always succeeds, so the
      # container would be reported healthy no matter what the cluster state is.
      test: ["CMD-SHELL", "curl -fsS 'http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=1s' >/dev/null || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 30

volumes:
  elasticsearch-hanlp-data:
|
||||
47
src/zqyy/elasticsearch/Dockerfile
Normal file
47
src/zqyy/elasticsearch/Dockerfile
Normal file
@ -0,0 +1,47 @@
|
||||
# syntax=docker/dockerfile:1.7

# Two-stage build:
#   1. hanlp-builder  - clones the HanLP analysis plugin and builds it with Maven.
#   2. final stage    - official Elasticsearch image plus the built plugin,
#                       project config files and the custom dictionaries.

# Declared before the first FROM so the final FROM line can use ELASTIC_VERSION;
# each stage re-declares the ARGs it needs.
ARG ELASTIC_VERSION=8.7.0
ARG HANLP_REPO=https://gitee.com/xuxingchao/elasticsearch-analysis-hanlp.git
# Branch or tag passed to `git clone --branch`.
# NOTE(review): "master" is a moving target, so two builds may produce different
# plugins; pass a tag via --build-arg HANLP_REF=... for reproducible images.
ARG HANLP_REF=master

FROM maven:3.9.9-eclipse-temurin-17 AS hanlp-builder

ARG ELASTIC_VERSION
ARG HANLP_REPO
ARG HANLP_REF

RUN apt-get update \
    && apt-get install -y --no-install-recommends git unzip \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /build

RUN git clone --depth 1 --branch "${HANLP_REF}" "${HANLP_REPO}" hanlp

WORKDIR /build/hanlp

# Build the plugin and stage it under /plugin/analysis-hanlp.
# - The version guard uses a fixed-string match (-F) so the dots in the version
#   are not treated as regex wildcards, and it prints a reason when it fails.
# - The bundled dictionaries are merged into the data directory ("data/." form)
#   so they land in analysis-hanlp/data whether or not the release zip already
#   contains a data directory, instead of nesting as data/data.
# - The final test fails the build early if the unpacked layout is not the one
#   the COPY instructions below expect.
RUN if ! grep -qF "<version>${ELASTIC_VERSION}</version>" pom.xml; then \
        echo "pom.xml at ref '${HANLP_REF}' does not declare <version>${ELASTIC_VERSION}</version>" >&2; \
        exit 1; \
    fi \
    && mvn -B -DskipTests package \
    && mkdir -p /plugin \
    && unzip -q target/releases/elasticsearch-analysis-hanlp-*.zip -d /plugin \
    && mkdir -p /plugin/analysis-hanlp/data \
    && cp -a src/main/resources/data/. /plugin/analysis-hanlp/data/ \
    && test -f /plugin/analysis-hanlp/plugin-descriptor.properties

FROM docker.elastic.co/elasticsearch/elasticsearch:${ELASTIC_VERSION}

ARG ELASTIC_VERSION

LABEL org.opencontainers.image.title="zqyy/elasticsearch-hanlp" \
      org.opencontainers.image.description="Elasticsearch ${ELASTIC_VERSION} with HanLP analysis plugin" \
      org.opencontainers.image.source="https://gitee.com/xuxingchao/elasticsearch-analysis-hanlp"

# NOTE(review): no USER is set here, so this relies on the base image's default
# (presumably the non-root elasticsearch user) - confirm when changing the base.
RUN mkdir -p /usr/share/elasticsearch/config/analysis-hanlp

COPY --from=hanlp-builder --chown=elasticsearch:root /plugin/analysis-hanlp /usr/share/elasticsearch/plugins/analysis-hanlp

# The same two config files are installed in both the Elasticsearch config
# directory and the plugin's own config directory.
COPY --chown=elasticsearch:root config/hanlp.properties /usr/share/elasticsearch/config/analysis-hanlp/hanlp.properties
COPY --chown=elasticsearch:root config/hanlp-remote.xml /usr/share/elasticsearch/config/analysis-hanlp/hanlp-remote.xml
COPY --chown=elasticsearch:root config/hanlp.properties /usr/share/elasticsearch/plugins/analysis-hanlp/config/hanlp.properties
COPY --chown=elasticsearch:root config/hanlp-remote.xml /usr/share/elasticsearch/plugins/analysis-hanlp/config/hanlp-remote.xml

# Project dictionaries are added next to the dictionaries shipped with the plugin.
COPY --chown=elasticsearch:root dictionaries/custom/ /usr/share/elasticsearch/plugins/analysis-hanlp/data/dictionary/custom/

# Fail the build if Elasticsearch does not list the plugin as installed.
RUN /usr/share/elasticsearch/bin/elasticsearch-plugin list | grep -q '^analysis-hanlp$'
|
||||
208
src/zqyy/elasticsearch/README.md
Normal file
208
src/zqyy/elasticsearch/README.md
Normal file
@ -0,0 +1,208 @@
|
||||
# zqyy/elasticsearch-hanlp
|
||||
|
||||
Elasticsearch 8.7.0 image with the HanLP analysis plugin.
|
||||
|
||||
The HanLP plugin source used by this Dockerfile currently declares Elasticsearch
|
||||
8.7.0 as its runtime dependency, so the image is intentionally pinned to 8.7.0.
|
||||
Changing `ELASTIC_VERSION` requires a matching HanLP plugin branch or source
|
||||
revision.
|
||||
|
||||
## Build
|
||||
|
||||
From the project root:
|
||||
|
||||
```bash
|
||||
docker compose build elasticsearch-hanlp
|
||||
```
|
||||
|
||||
## Run with Docker Compose
|
||||
|
||||
From the project root:
|
||||
|
||||
```bash
|
||||
docker compose up -d elasticsearch-hanlp
|
||||
```
|
||||
|
||||
## HanLP Dictionary Maintenance
|
||||
|
||||
This image keeps HanLP files in the same paths used by the plugin:
|
||||
|
||||
| Purpose | Source path in this project | Path inside image/container |
|
||||
| --- | --- | --- |
|
||||
| HanLP main config | `config/hanlp.properties` | `/usr/share/elasticsearch/config/analysis-hanlp/hanlp.properties` |
|
||||
| Remote dictionary config | `config/hanlp-remote.xml` | `/usr/share/elasticsearch/config/analysis-hanlp/hanlp-remote.xml` |
|
||||
| Plugin fallback config | `config/hanlp.properties` | `/usr/share/elasticsearch/plugins/analysis-hanlp/config/hanlp.properties` |
|
||||
| Business custom dictionary | `dictionaries/custom/zqyy-custom.txt` | `/usr/share/elasticsearch/plugins/analysis-hanlp/data/dictionary/custom/zqyy-custom.txt` |
|
||||
| Built-in HanLP dictionaries | copied from plugin source `src/main/resources/data` | `/usr/share/elasticsearch/plugins/analysis-hanlp/data` |
|
||||
|
||||
The plugin reads `hanlp.properties` from:
|
||||
|
||||
```text
|
||||
/usr/share/elasticsearch/config/analysis-hanlp/hanlp.properties
|
||||
```
|
||||
|
||||
The `root` value in `hanlp.properties` is:
|
||||
|
||||
```properties
|
||||
root=plugins/analysis-hanlp/
|
||||
```
|
||||
|
||||
So dictionary paths are resolved under:
|
||||
|
||||
```text
|
||||
/usr/share/elasticsearch/plugins/analysis-hanlp/
|
||||
```
|
||||
|
||||
### Included Dictionary Files
|
||||
|
||||
The Dockerfile copies the common HanLP data shipped with the plugin source into
|
||||
the image. Important paths include:
|
||||
|
||||
```text
|
||||
data/dictionary/CoreNatureDictionary.txt
|
||||
data/dictionary/CoreNatureDictionary.ngram.txt
|
||||
data/dictionary/stopwords.txt
|
||||
data/dictionary/synonym/CoreSynonym.txt
|
||||
data/dictionary/custom/CustomDictionary.txt
|
||||
data/dictionary/custom/ModernChineseSupplementaryWord.txt
|
||||
data/dictionary/custom/ChinesePlaceName.txt
|
||||
data/dictionary/custom/PersonalName.txt
|
||||
data/dictionary/custom/OrganizationName.txt
|
||||
data/dictionary/custom/ShanghaiPlaceName.txt
|
||||
data/dictionary/person/nr.txt
|
||||
data/dictionary/person/nrf.txt
|
||||
data/dictionary/place/ns.txt
|
||||
data/dictionary/organization/nt.txt
|
||||
data/dictionary/tc/
|
||||
```
|
||||
|
||||
This project also adds:
|
||||
|
||||
```text
|
||||
data/dictionary/custom/zqyy-custom.txt
|
||||
```
|
||||
|
||||
and registers it in `CustomDictionaryPath`:
|
||||
|
||||
```properties
|
||||
CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; ModernChineseSupplementaryWord.txt; ChinesePlaceName.txt ns; PersonalName.txt; OrganizationName.txt; ShanghaiPlaceName.txt ns; zqyy-custom.txt nz; data/dictionary/person/nrf.txt nrf;
|
||||
```
|
||||
|
||||
### Add Or Change Business Words
|
||||
|
||||
Edit:
|
||||
|
||||
```text
|
||||
dictionaries/custom/zqyy-custom.txt
|
||||
```
|
||||
|
||||
Format:
|
||||
|
||||
```text
|
||||
word nature frequency
|
||||
```
|
||||
|
||||
Examples:
|
||||
|
||||
```text
|
||||
zqyy nz 1000
|
||||
某业务系统 nz 1000
|
||||
张三 nr 1000
|
||||
上海浦东 ns 1000
|
||||
某某科技有限公司 nt 1000
|
||||
```
|
||||
|
||||
Common natures:
|
||||
|
||||
| Nature | Meaning |
|
||||
| --- | --- |
|
||||
| `nz` | other proper noun, common for products and domain terms |
|
||||
| `ns` | place name |
|
||||
| `nr` | person name |
|
||||
| `nt` | organization name |
|
||||
| `n` | noun |
|
||||
| `v` | verb |
|
||||
|
||||
After changing local files, rebuild and restart:
|
||||
|
||||
```bash
|
||||
docker compose build --no-cache elasticsearch-hanlp
|
||||
docker compose up -d elasticsearch-hanlp
|
||||
```
|
||||
|
||||
For local development only, you can bind-mount the dictionary or config files.
|
||||
See the commented `volumes` lines in the root `docker-compose.yml`. The plugin
|
||||
supports custom dictionary hot reload, but production images should be rebuilt
|
||||
so every node has identical dictionary files.
|
||||
|
||||
### Remote Dictionaries
|
||||
|
||||
Remote dictionaries are configured in:
|
||||
|
||||
```text
|
||||
config/hanlp-remote.xml
|
||||
```
|
||||
|
||||
Example:
|
||||
|
||||
```xml
|
||||
<entry key="remote_ext_dict">http://example.com/hanlp/ext.dic nz</entry>
|
||||
<entry key="remote_ext_stopwords">http://example.com/hanlp/stopwords.txt</entry>
|
||||
```
|
||||
|
||||
If remote dictionaries are not needed, keep the entries commented out.
|
||||
|
||||
## Quick Test
|
||||
|
||||
```bash
|
||||
curl "http://localhost:9200/_cat/plugins?v"
|
||||
|
||||
curl -X POST "http://localhost:9200/_analyze?pretty" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"tokenizer": "hanlp",
|
||||
"text": "中华人民共和国国歌"
|
||||
}'
|
||||
```
|
||||
|
||||
## Index Example
|
||||
|
||||
```bash
|
||||
curl -X PUT "http://localhost:9200/hanlp-demo" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"hanlp_analyzer": {
|
||||
"type": "custom",
|
||||
"tokenizer": "hanlp"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"mappings": {
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "text",
|
||||
"analyzer": "hanlp_analyzer",
|
||||
"search_analyzer": "hanlp_analyzer"
|
||||
}
|
||||
}
|
||||
}
|
||||
}'
|
||||
|
||||
curl -X POST "http://localhost:9200/hanlp-demo/_doc/1?refresh=true" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"title":"中华人民共和国国歌"}'
|
||||
|
||||
curl -X GET "http://localhost:9200/hanlp-demo/_search?pretty" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"query": {
|
||||
"match": {
|
||||
"title": "国歌"
|
||||
}
|
||||
}
|
||||
}'
|
||||
```
|
||||
11
src/zqyy/elasticsearch/config/hanlp-remote.xml
Normal file
11
src/zqyy/elasticsearch/config/hanlp-remote.xml
Normal file
@ -0,0 +1,11 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
|
||||
<properties>
|
||||
<comment>HanLP Analyzer remote dictionary configuration</comment>
|
||||
|
||||
<!-- Remote extension dictionary. Example: http://example.com/hanlp/ext.dic nz -->
|
||||
<!--<entry key="remote_ext_dict">words_location</entry>-->
|
||||
|
||||
<!-- Remote stop-word dictionary. Example: http://example.com/hanlp/stopwords.txt -->
|
||||
<!--<entry key="remote_ext_stopwords">stop_words_location</entry>-->
|
||||
</properties>
|
||||
47
src/zqyy/elasticsearch/config/hanlp.properties
Normal file
47
src/zqyy/elasticsearch/config/hanlp.properties
Normal file
@ -0,0 +1,47 @@
|
||||
# Root path of HanLP data, relative to ES_HOME.
|
||||
root=plugins/analysis-hanlp/
|
||||
|
||||
# Core dictionary path
|
||||
CoreDictionaryPath=data/dictionary/CoreNatureDictionary.txt
|
||||
|
||||
# BiGram dictionary path
|
||||
BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt
|
||||
|
||||
# Core stop word dictionary path
|
||||
CoreStopWordDictionaryPath=data/dictionary/stopwords.txt
|
||||
|
||||
# Core synonym dictionary path
|
||||
CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt
|
||||
|
||||
# Person name dictionary path
|
||||
PersonDictionaryPath=data/dictionary/person/nr.txt
|
||||
|
||||
# Person name dictionary tr path
|
||||
PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt
|
||||
|
||||
# Traditional/Simplified Chinese conversion dictionary root path
|
||||
tcDictionaryRoot=data/dictionary/tc
|
||||
|
||||
# Custom dictionary path.
|
||||
# Paths after the first custom path may be relative to data/dictionary/custom.
|
||||
# Format: word [nature] [frequency], one word per line in the dictionary file.
|
||||
CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; ModernChineseSupplementaryWord.txt; ChinesePlaceName.txt ns; PersonalName.txt; OrganizationName.txt; ShanghaiPlaceName.txt ns; zqyy-custom.txt nz; data/dictionary/person/nrf.txt nrf;
|
||||
|
||||
# HMM segment model path
|
||||
#HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin
|
||||
|
||||
# Show term nature
|
||||
#ShowTermNature=true
|
||||
|
||||
# IO Adapter
|
||||
##IOAdapter=com.hankcs.hanlp.corpus.io.FileIOAdapter
|
||||
|
||||
# Perceptron model paths. These require the corresponding model files to exist.
|
||||
PerceptronCWSModelPath=data/model/perceptron/pku1998/cws.bin
|
||||
PerceptronPOSModelPath=data/model/perceptron/pku1998/pos.bin
|
||||
PerceptronNERModelPath=data/model/perceptron/pku1998/ner.bin
|
||||
|
||||
# CRF model paths. These require the corresponding model files to exist.
|
||||
CRFCWSModelPath=data/model/crf/pku199801/cws.txt.bin
|
||||
CRFPOSModelPath=data/model/crf/pku199801/pos.txt.bin
|
||||
CRFNERModelPath=data/model/crf/pku199801/ner.txt.bin
|
||||
@ -0,0 +1,4 @@
|
||||
zqyy nz 1000
|
||||
HanLP nz 1000
|
||||
Elasticsearch nz 1000
|
||||
自定义业务词条 nz 1000
|
||||
50
src/zqyy/elasticsearch/test-hanlp.sh
Normal file
50
src/zqyy/elasticsearch/test-hanlp.sh
Normal file
@ -0,0 +1,50 @@
|
||||
#!/usr/bin/env sh
# Smoke test for a running elasticsearch-hanlp container.
#
# Checks, in order: the node answers, the HanLP plugin is listed, the "hanlp"
# tokenizer returns tokens, and a word from the zqyy custom dictionary comes
# back as a single token. Exits non-zero on the first failed check.
#
# Environment:
#   ES_URL  Base URL of the node (default: http://localhost:9200).
set -eu

ES_URL="${ES_URL:-http://localhost:9200}"

# Print the given message on stderr and abort the script with status 1.
fail() {
  echo "$1" >&2
  exit 1
}

# POST the given JSON body to the _analyze endpoint and print the raw response.
analyze() {
  curl -fsS -X POST "${ES_URL}/_analyze" \
    -H "Content-Type: application/json" \
    -d "$1"
}

echo "Checking Elasticsearch at ${ES_URL} ..."
curl -fsS "${ES_URL}" >/dev/null

echo "Checking analysis-hanlp plugin ..."
plugin_names="$(curl -fsS "${ES_URL}/_cat/plugins?h=component")"
echo "${plugin_names}"

# The plugin is accepted under either of the two component names below.
if ! echo "${plugin_names}" | grep -Eq "^(analysis-hanlp|hanlp plugin)$"; then
  echo "HanLP plugin is not loaded." >&2
  echo "Loaded plugins:" >&2
  # Best effort only: the verbose listing is diagnostic output.
  curl -fsS "${ES_URL}/_cat/plugins?v" >&2 || true
  exit 1
fi

echo "Testing HanLP analyzer ..."
analyzer_reply="$(analyze '{"tokenizer":"hanlp","text":"中华人民共和国国歌"}')"
echo "${analyzer_reply}"

echo "${analyzer_reply}" | grep -q '"tokens"' \
  || fail "HanLP analyzer test failed."
echo "HanLP analyzer is available."

echo "Testing zqyy custom dictionary ..."
custom_reply="$(analyze '{"tokenizer":"hanlp","text":"这是自定义业务词条测试"}')"
echo "${custom_reply}"

# The dictionary entry must come back as one token, not split into pieces.
echo "${custom_reply}" | grep -q '"token":"自定义业务词条"' \
  || fail "zqyy custom dictionary token was not found in analyze result."
echo "zqyy custom dictionary is available."
|
||||
Loading…
Reference in New Issue
Block a user