初始化,构建elasticsearch-hanlp-dockerfile

This commit is contained in:
chenchao 2026-06-14 22:21:24 +08:00
commit 3567103a3e
9 changed files with 469 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
# Ignore .tar files placed directly in the Elasticsearch image directory.
# NOTE(review): presumably image archives exported for offline transfer — confirm.
/src/zqyy/elasticsearch/*.tar

62
README.md Normal file
View File

@ -0,0 +1,62 @@
# zqyy-dockerfile
This repository keeps Dockerfiles under each image directory and uses one root
`docker-compose.yml` as the unified build/run entrypoint.
## Layout
```text
docker-compose.yml
src/
  zqyy/
    elasticsearch/
      Dockerfile
      config/
      dictionaries/
      README.md
      test-hanlp.sh
```
## Build Images
Build one image:
```bash
docker compose build elasticsearch-hanlp
```
Build all images defined in the root compose file:
```bash
docker compose build
```
## Run Services
Run one service:
```bash
docker compose up -d elasticsearch-hanlp
```
Run all services:
```bash
docker compose up -d
```
## Add A New Image
1. Create a new image directory under `src/zqyy/<image-name>/`.
2. Put that image's `Dockerfile` and related files in the image directory.
3. Add a service in the root `docker-compose.yml`.
4. Set `build.context` to the image directory, for example:
```yaml
services:
  example:
    image: zqyy/example:latest
    build:
      context: ./src/zqyy/example
      dockerfile: Dockerfile
```

39
docker-compose.yml Normal file
View File

@ -0,0 +1,39 @@
# Root compose file: the single build/run entrypoint for the images in this repository.
services:
  # Elasticsearch with the HanLP analysis plugin, configured as a single-node cluster.
  elasticsearch-hanlp:
    image: zqyy/elasticsearch-hanlp:8.7.0
    build:
      context: ./src/zqyy/elasticsearch
      dockerfile: Dockerfile
      args:
        # Keep this in step with the image tag above.
        ELASTIC_VERSION: "8.7.0"
    container_name: zqyy-elasticsearch-hanlp
    environment:
      discovery.type: single-node
      # Security is disabled, so the API published on port 9200 is plain,
      # unauthenticated HTTP.
      xpack.security.enabled: "false"
      xpack.security.enrollment.enabled: "false"
      ES_JAVA_OPTS: "-Xms1g -Xmx1g"
    ports:
      - "9200:9200"
      - "9300:9300"
    volumes:
      - elasticsearch-hanlp-data:/usr/share/elasticsearch/data
      # Optional: mount local dictionary/config files during development.
      # Rebuild the image for production so every node uses the same files.
      # - ./src/zqyy/elasticsearch/dictionaries/custom/zqyy-custom.txt:/usr/share/elasticsearch/plugins/analysis-hanlp/data/dictionary/custom/zqyy-custom.txt:ro
      # - ./src/zqyy/elasticsearch/config/hanlp.properties:/usr/share/elasticsearch/config/analysis-hanlp/hanlp.properties:ro
      # - ./src/zqyy/elasticsearch/config/hanlp-remote.xml:/usr/share/elasticsearch/config/analysis-hanlp/hanlp-remote.xml:ro
    ulimits:
      # Unlimited locked memory. Note that bootstrap.memory_lock is not set in
      # the environment above.
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 65535
        hard: 65535
    healthcheck:
      # The URL must stay quoted. CMD-SHELL runs the string through a shell, and
      # an unquoted "&" would send curl to the background and leave the exit
      # status to the trailing "timeout=1s" assignment, so the check would
      # always report healthy.
      test: ["CMD-SHELL", "curl -fsS 'http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=1s' >/dev/null || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 30
volumes:
  elasticsearch-hanlp-data:

View File

@ -0,0 +1,47 @@
# syntax=docker/dockerfile:1.7

# Elasticsearch image with the HanLP analysis plugin compiled from source.
#
# Build arguments:
#   ELASTIC_VERSION  Tag of the Elasticsearch base image. The build aborts unless
#                    the plugin's pom.xml contains <version>ELASTIC_VERSION</version>.
#   HANLP_REPO       Git URL of the plugin source.
#   HANLP_REF        Branch or tag to build; it is passed to `git clone --branch`,
#                    which does not accept a bare commit hash. The default tracks
#                    a moving branch, so rebuilds can pick up new plugin commits.
ARG ELASTIC_VERSION=8.7.0
ARG HANLP_REPO=https://gitee.com/xuxingchao/elasticsearch-analysis-hanlp.git
ARG HANLP_REF=master

# ---- Stage 1: build the plugin and assemble its directory under /plugin ----
FROM maven:3.9.9-eclipse-temurin-17 AS hanlp-builder
# ARGs declared before the first FROM must be redeclared to be visible in a stage.
ARG ELASTIC_VERSION
ARG HANLP_REPO
ARG HANLP_REF

RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        git \
        unzip \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /build
RUN git clone --depth 1 --branch "${HANLP_REF}" "${HANLP_REPO}" hanlp

WORKDIR /build/hanlp
# 1. Version guard: -F matches the version literally, so the dots in "8.7.0"
#    are not treated as regex wildcards, and a mismatch explains itself.
# 2. Build the release zip and unpack it into /plugin.
# 3. Add the dictionary data from the source tree. The trailing "/." copies the
#    directory contents, so the result is /plugin/analysis-hanlp/data/... whether
#    or not the zip already created a data/ directory.
# 4. Sanity check that the unpacked plugin has its descriptor.
RUN if ! grep -qF "<version>${ELASTIC_VERSION}</version>" pom.xml; then \
        echo "pom.xml has no <version>${ELASTIC_VERSION}</version>; choose a HANLP_REF built for Elasticsearch ${ELASTIC_VERSION}" >&2; \
        exit 1; \
    fi \
    && mvn -B -DskipTests package \
    && mkdir -p /plugin \
    && unzip -q target/releases/elasticsearch-analysis-hanlp-*.zip -d /plugin \
    && mkdir -p /plugin/analysis-hanlp/data \
    && cp -a src/main/resources/data/. /plugin/analysis-hanlp/data/ \
    && test -f /plugin/analysis-hanlp/plugin-descriptor.properties

# ---- Stage 2: runtime image ----
FROM docker.elastic.co/elasticsearch/elasticsearch:${ELASTIC_VERSION}
ARG ELASTIC_VERSION

LABEL org.opencontainers.image.title="zqyy/elasticsearch-hanlp" \
      org.opencontainers.image.description="Elasticsearch ${ELASTIC_VERSION} with HanLP analysis plugin" \
      org.opencontainers.image.source="https://gitee.com/xuxingchao/elasticsearch-analysis-hanlp"

RUN mkdir -p /usr/share/elasticsearch/config/analysis-hanlp

# Plugin binaries plus the built-in dictionaries assembled by the builder stage.
COPY --from=hanlp-builder --chown=elasticsearch:root /plugin/analysis-hanlp /usr/share/elasticsearch/plugins/analysis-hanlp

# The two config files are installed twice: under config/analysis-hanlp and
# under the plugin's own config/ directory.
COPY --chown=elasticsearch:root config/hanlp.properties /usr/share/elasticsearch/config/analysis-hanlp/hanlp.properties
COPY --chown=elasticsearch:root config/hanlp-remote.xml /usr/share/elasticsearch/config/analysis-hanlp/hanlp-remote.xml
COPY --chown=elasticsearch:root config/hanlp.properties /usr/share/elasticsearch/plugins/analysis-hanlp/config/hanlp.properties
COPY --chown=elasticsearch:root config/hanlp-remote.xml /usr/share/elasticsearch/plugins/analysis-hanlp/config/hanlp-remote.xml

# Project dictionaries are added next to the built-in custom dictionaries.
COPY --chown=elasticsearch:root dictionaries/custom/ /usr/share/elasticsearch/plugins/analysis-hanlp/data/dictionary/custom/

# Fail the build if Elasticsearch does not list the plugin. Only grep's exit
# status is checked; grep -q may stop reading as soon as it finds the line.
RUN /usr/share/elasticsearch/bin/elasticsearch-plugin list | grep -q '^analysis-hanlp$'

View File

@ -0,0 +1,208 @@
# zqyy/elasticsearch-hanlp
Elasticsearch 8.7.0 image with the HanLP analysis plugin.
The HanLP plugin source used by this Dockerfile currently declares Elasticsearch
8.7.0 as its runtime dependency, so the image is intentionally pinned to 8.7.0.
Changing `ELASTIC_VERSION` requires a matching HanLP plugin branch or source
revision.
## Build
From the project root:
```bash
docker compose build elasticsearch-hanlp
```
## Run with Docker Compose
From the project root:
```bash
docker compose up -d elasticsearch-hanlp
```
## HanLP Dictionary Maintenance
This image keeps HanLP files in the same paths used by the plugin:
| Purpose | Source path in this project | Path inside image/container |
| --- | --- | --- |
| HanLP main config | `config/hanlp.properties` | `/usr/share/elasticsearch/config/analysis-hanlp/hanlp.properties` |
| Remote dictionary config | `config/hanlp-remote.xml` | `/usr/share/elasticsearch/config/analysis-hanlp/hanlp-remote.xml` |
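| Plugin fallback config | `config/hanlp.properties` | `/usr/share/elasticsearch/plugins/analysis-hanlp/config/hanlp.properties` |
| Plugin fallback remote dictionary config | `config/hanlp-remote.xml` | `/usr/share/elasticsearch/plugins/analysis-hanlp/config/hanlp-remote.xml` |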
| Business custom dictionary | `dictionaries/custom/zqyy-custom.txt` | `/usr/share/elasticsearch/plugins/analysis-hanlp/data/dictionary/custom/zqyy-custom.txt` |
| Built-in HanLP dictionaries | copied from plugin source `src/main/resources/data` | `/usr/share/elasticsearch/plugins/analysis-hanlp/data` |
The plugin reads `hanlp.properties` from:
```text
/usr/share/elasticsearch/config/analysis-hanlp/hanlp.properties
```
The `root` value in `hanlp.properties` is:
```properties
root=plugins/analysis-hanlp/
```
So dictionary paths are resolved under:
```text
/usr/share/elasticsearch/plugins/analysis-hanlp/
```
### Included Dictionary Files
The Dockerfile copies the common HanLP data shipped with the plugin source into
the image. Important paths include:
```text
data/dictionary/CoreNatureDictionary.txt
data/dictionary/CoreNatureDictionary.ngram.txt
data/dictionary/stopwords.txt
data/dictionary/synonym/CoreSynonym.txt
data/dictionary/custom/CustomDictionary.txt
data/dictionary/custom/ModernChineseSupplementaryWord.txt
data/dictionary/custom/ChinesePlaceName.txt
data/dictionary/custom/PersonalName.txt
data/dictionary/custom/OrganizationName.txt
data/dictionary/custom/ShanghaiPlaceName.txt
data/dictionary/person/nr.txt
data/dictionary/person/nrf.txt
data/dictionary/place/ns.txt
data/dictionary/organization/nt.txt
data/dictionary/tc/
```
This project also adds:
```text
data/dictionary/custom/zqyy-custom.txt
```
and registers it in `CustomDictionaryPath`:
```properties
CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; ModernChineseSupplementaryWord.txt; ChinesePlaceName.txt ns; PersonalName.txt; OrganizationName.txt; ShanghaiPlaceName.txt ns; zqyy-custom.txt nz; data/dictionary/person/nrf.txt nrf;
```
### Add Or Change Business Words
Edit:
```text
dictionaries/custom/zqyy-custom.txt
```
Format:
```text
word nature frequency
```
Examples:
```text
zqyy nz 1000
某业务系统 nz 1000
张三 nr 1000
上海浦东 ns 1000
某某科技有限公司 nt 1000
```
Common natures:
| Nature | Meaning |
| --- | --- |
| `nz` | other proper noun, common for products and domain terms |
| `ns` | place name |
| `nr` | person name |
| `nt` | organization name |
| `n` | noun |
| `v` | verb |
After changing local files, rebuild and restart:
```bash
docker compose build --no-cache elasticsearch-hanlp
docker compose up -d elasticsearch-hanlp
```
For local development only, you can bind-mount the dictionary or config files.
See the commented `volumes` lines in the root `docker-compose.yml`. The plugin
supports custom dictionary hot reload, but production images should be rebuilt
so every node has identical dictionary files.
### Remote Dictionaries
Remote dictionaries are configured in:
```text
config/hanlp-remote.xml
```
Example:
```xml
<entry key="remote_ext_dict">http://example.com/hanlp/ext.dic nz</entry>
<entry key="remote_ext_stopwords">http://example.com/hanlp/stopwords.txt</entry>
```
If remote dictionaries are not needed, keep the entries commented out.
## Quick Test
```bash
curl "http://localhost:9200/_cat/plugins?v"
curl -X POST "http://localhost:9200/_analyze?pretty" \
-H "Content-Type: application/json" \
-d '{
"tokenizer": "hanlp",
"text": "中华人民共和国国歌"
}'
```
## Index Example
```bash
curl -X PUT "http://localhost:9200/hanlp-demo" \
-H "Content-Type: application/json" \
-d '{
"settings": {
"analysis": {
"analyzer": {
"hanlp_analyzer": {
"type": "custom",
"tokenizer": "hanlp"
}
}
}
},
"mappings": {
"properties": {
"title": {
"type": "text",
"analyzer": "hanlp_analyzer",
"search_analyzer": "hanlp_analyzer"
}
}
}
}'
curl -X POST "http://localhost:9200/hanlp-demo/_doc/1?refresh=true" \
-H "Content-Type: application/json" \
-d '{"title":"中华人民共和国国歌"}'
curl -X GET "http://localhost:9200/hanlp-demo/_search?pretty" \
-H "Content-Type: application/json" \
-d '{
"query": {
"match": {
"title": "国歌"
}
}
}'
```

View File

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
<!-- Remote dictionary settings for the HanLP plugin. Both entries below are commented out, so no remote dictionary is configured. -->
<comment>HanLP Analyzer remote dictionary configuration</comment>
<!-- Remote extension dictionary. To enable it, uncomment the entry and replace the placeholder with the dictionary location. Example value: http://example.com/hanlp/ext.dic nz -->
<!--<entry key="remote_ext_dict">words_location</entry>-->
<!-- Remote stop-word dictionary. To enable it, uncomment the entry and replace the placeholder with the dictionary location. Example value: http://example.com/hanlp/stopwords.txt -->
<!--<entry key="remote_ext_stopwords">stop_words_location</entry>-->
</properties>

View File

@ -0,0 +1,47 @@
# HanLP configuration for the analysis-hanlp Elasticsearch plugin.
# Every dictionary and model path below is resolved under `root`.

# Root path of HanLP data, relative to ES_HOME.
root=plugins/analysis-hanlp/

# Core dictionary path
CoreDictionaryPath=data/dictionary/CoreNatureDictionary.txt
# BiGram dictionary path
BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt
# Core stop word dictionary path
CoreStopWordDictionaryPath=data/dictionary/stopwords.txt
# Core synonym dictionary path.
# The doubled "Dictionary" in the key is HanLP's own spelling; do not rename it.
CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt
# Person name dictionary path
PersonDictionaryPath=data/dictionary/person/nr.txt
# Person name dictionary transition-matrix (tr) path
PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt
# Traditional/Simplified Chinese conversion dictionary root path
tcDictionaryRoot=data/dictionary/tc

# Custom dictionary path: a ';'-separated list of "path [default nature]" entries.
# Dictionary file format: word [nature] [frequency], one word per line.
# An entry that starts with a space is resolved relative to the directory of
# the preceding entry (here data/dictionary/custom/). An entry with no leading
# space is resolved from `root`. The final nrf.txt entry lives in another
# directory, so it must NOT have a space after the preceding ';'.
# zqyy-custom.txt is this project's business dictionary.
CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; ModernChineseSupplementaryWord.txt; ChinesePlaceName.txt ns; PersonalName.txt; OrganizationName.txt; ShanghaiPlaceName.txt ns; zqyy-custom.txt nz;data/dictionary/person/nrf.txt nrf;

# HMM segment model path
#HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin
# Show term nature
#ShowTermNature=true
# IO Adapter
##IOAdapter=com.hankcs.hanlp.corpus.io.FileIOAdapter

# Perceptron model paths. These require the corresponding model files to exist.
PerceptronCWSModelPath=data/model/perceptron/pku1998/cws.bin
PerceptronPOSModelPath=data/model/perceptron/pku1998/pos.bin
PerceptronNERModelPath=data/model/perceptron/pku1998/ner.bin
# CRF model paths. These require the corresponding model files to exist.
CRFCWSModelPath=data/model/crf/pku199801/cws.txt.bin
CRFPOSModelPath=data/model/crf/pku199801/pos.txt.bin
CRFNERModelPath=data/model/crf/pku199801/ner.txt.bin

View File

@ -0,0 +1,4 @@
zqyy nz 1000
HanLP nz 1000
Elasticsearch nz 1000
自定义业务词条 nz 1000

View File

@ -0,0 +1,50 @@
#!/usr/bin/env sh
# Smoke test for a running Elasticsearch instance with the HanLP plugin.
#
# Checks, in order: Elasticsearch answers at ES_URL, the HanLP plugin is listed
# by _cat/plugins, the "hanlp" tokenizer returns a token list, and a word from
# the zqyy custom dictionary appears as a token. Stops at the first failure
# with a non-zero exit status.
#
# Environment:
#   ES_URL  Base URL of Elasticsearch (default: http://localhost:9200).
set -eu

: "${ES_URL:=http://localhost:9200}"

# Print the message in $1 on stderr and abort with status 1.
fail() {
    echo "$1" >&2
    exit 1
}

# POST the JSON request body in $1 to the _analyze API and print the response.
analyze() {
    curl -fsS -X POST "${ES_URL}/_analyze" \
        -H "Content-Type: application/json" \
        -d "$1"
}

echo "Checking Elasticsearch at ${ES_URL} ..."
curl -fsS "${ES_URL}" >/dev/null

echo "Checking analysis-hanlp plugin ..."
plugin_names="$(curl -fsS "${ES_URL}/_cat/plugins?h=component")"
echo "${plugin_names}"
# Either component name is accepted as proof that the plugin is loaded.
echo "${plugin_names}" | grep -Eq "^(analysis-hanlp|hanlp plugin)$" || {
    echo "HanLP plugin is not loaded." >&2
    echo "Loaded plugins:" >&2
    # Best effort: a failure of this diagnostic listing is ignored.
    curl -fsS "${ES_URL}/_cat/plugins?v" >&2 || true
    exit 1
}

echo "Testing HanLP analyzer ..."
tokens_json="$(analyze '{"tokenizer":"hanlp","text":"中华人民共和国国歌"}')"
echo "${tokens_json}"
echo "${tokens_json}" | grep -q '"tokens"' || fail "HanLP analyzer test failed."
echo "HanLP analyzer is available."

echo "Testing zqyy custom dictionary ..."
custom_json="$(analyze '{"tokenizer":"hanlp","text":"这是自定义业务词条测试"}')"
echo "${custom_json}"
echo "${custom_json}" | grep -q '"token":"自定义业务词条"' \
    || fail "zqyy custom dictionary token was not found in analyze result."
echo "zqyy custom dictionary is available."