commit 3567103a3e347194ad5c8390e06edb04ca8f47f4 Author: chenchao Date: Sun Jun 14 22:21:24 2026 +0800 初始化,构建elasticsearch-hanlp-dockerfile diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..71c5d21 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/src/zqyy/elasticsearch/*.tar diff --git a/README.md b/README.md new file mode 100644 index 0000000..2261ca0 --- /dev/null +++ b/README.md @@ -0,0 +1,62 @@ +# zqyy-dockerfile + +This repository keeps Dockerfiles under each image directory and uses one root +`docker-compose.yml` as the unified build/run entrypoint. + +## Layout + +```text +docker-compose.yml +src/ + zqyy/ + elasticsearch/ + Dockerfile + config/ + dictionaries/ + README.md + test-hanlp.sh +``` + +## Build Images + +Build one image: + +```bash +docker compose build elasticsearch-hanlp +``` + +Build all images defined in the root compose file: + +```bash +docker compose build +``` + +## Run Services + +Run one service: + +```bash +docker compose up -d elasticsearch-hanlp +``` + +Run all services: + +```bash +docker compose up -d +``` + +## Add A New Image + +1. Create a new image directory under `src/zqyy/<image-name>/`. +2. Put that image's `Dockerfile` and related files in the image directory. +3. Add a service in the root `docker-compose.yml`. +4. 
Set `build.context` to the image directory, for example: + +```yaml +services: + example: + image: zqyy/example:latest + build: + context: ./src/zqyy/example + dockerfile: Dockerfile +``` diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..1188059 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,39 @@ +services: + elasticsearch-hanlp: + image: zqyy/elasticsearch-hanlp:8.7.0 + build: + context: ./src/zqyy/elasticsearch + dockerfile: Dockerfile + args: + ELASTIC_VERSION: "8.7.0" + container_name: zqyy-elasticsearch-hanlp + environment: + discovery.type: single-node + xpack.security.enabled: "false" + xpack.security.enrollment.enabled: "false" + ES_JAVA_OPTS: "-Xms1g -Xmx1g" + ports: + - "9200:9200" + - "9300:9300" + volumes: + - elasticsearch-hanlp-data:/usr/share/elasticsearch/data + # Optional: mount local dictionary/config files during development. + # Rebuild the image for production so every node uses the same files. + # - ./src/zqyy/elasticsearch/dictionaries/custom/zqyy-custom.txt:/usr/share/elasticsearch/plugins/analysis-hanlp/data/dictionary/custom/zqyy-custom.txt:ro + # - ./src/zqyy/elasticsearch/config/hanlp.properties:/usr/share/elasticsearch/config/analysis-hanlp/hanlp.properties:ro + # - ./src/zqyy/elasticsearch/config/hanlp-remote.xml:/usr/share/elasticsearch/config/analysis-hanlp/hanlp-remote.xml:ro + ulimits: + memlock: + soft: -1 + hard: -1 + nofile: + soft: 65535 + hard: 65535 + healthcheck: + test: ["CMD-SHELL", "curl -fsS 'http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=1s' >/dev/null || exit 1"] # URL is quoted so the shell does not treat "&" as the background operator (which made the check always pass) + interval: 10s + timeout: 5s + retries: 30 + +volumes: + elasticsearch-hanlp-data: diff --git a/src/zqyy/elasticsearch/Dockerfile b/src/zqyy/elasticsearch/Dockerfile new file mode 100644 index 0000000..a1d1a30 --- /dev/null +++ b/src/zqyy/elasticsearch/Dockerfile @@ -0,0 +1,47 @@ +# syntax=docker/dockerfile:1.7 + +ARG ELASTIC_VERSION=8.7.0 +ARG 
HANLP_REPO=https://gitee.com/xuxingchao/elasticsearch-analysis-hanlp.git +ARG HANLP_REF=master + +FROM maven:3.9.9-eclipse-temurin-17 AS hanlp-builder + +ARG ELASTIC_VERSION +ARG HANLP_REPO +ARG HANLP_REF + +RUN apt-get update \ + && apt-get install -y --no-install-recommends git unzip \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /build + +RUN git clone --depth 1 --branch "${HANLP_REF}" "${HANLP_REPO}" hanlp + +WORKDIR /build/hanlp +# Fail fast unless pom.xml mentions ELASTIC_VERSION; -F matches it literally so the dots are not regex wildcards. +RUN grep -qF -- "${ELASTIC_VERSION}" pom.xml \ + && mvn -B -DskipTests package \ + && mkdir -p /plugin \ + && unzip -q target/releases/elasticsearch-analysis-hanlp-*.zip -d /plugin \ + && cp -a src/main/resources/data /plugin/analysis-hanlp/data \ + && test -f /plugin/analysis-hanlp/plugin-descriptor.properties + +FROM docker.elastic.co/elasticsearch/elasticsearch:${ELASTIC_VERSION} + +ARG ELASTIC_VERSION + +LABEL org.opencontainers.image.title="zqyy/elasticsearch-hanlp" \ + org.opencontainers.image.description="Elasticsearch ${ELASTIC_VERSION} with HanLP analysis plugin" \ + org.opencontainers.image.source="https://gitee.com/xuxingchao/elasticsearch-analysis-hanlp" + +RUN mkdir -p /usr/share/elasticsearch/config/analysis-hanlp + +COPY --from=hanlp-builder --chown=elasticsearch:root /plugin/analysis-hanlp /usr/share/elasticsearch/plugins/analysis-hanlp +COPY --chown=elasticsearch:root config/hanlp.properties /usr/share/elasticsearch/config/analysis-hanlp/hanlp.properties +COPY --chown=elasticsearch:root config/hanlp-remote.xml /usr/share/elasticsearch/config/analysis-hanlp/hanlp-remote.xml +COPY --chown=elasticsearch:root config/hanlp.properties /usr/share/elasticsearch/plugins/analysis-hanlp/config/hanlp.properties +COPY --chown=elasticsearch:root config/hanlp-remote.xml /usr/share/elasticsearch/plugins/analysis-hanlp/config/hanlp-remote.xml +COPY --chown=elasticsearch:root dictionaries/custom/ /usr/share/elasticsearch/plugins/analysis-hanlp/data/dictionary/custom/ + +RUN /usr/share/elasticsearch/bin/elasticsearch-plugin list | grep -q 
'^analysis-hanlp$' diff --git a/src/zqyy/elasticsearch/README.md b/src/zqyy/elasticsearch/README.md new file mode 100644 index 0000000..de38841 --- /dev/null +++ b/src/zqyy/elasticsearch/README.md @@ -0,0 +1,208 @@ +# zqyy/elasticsearch-hanlp + +Elasticsearch 8.7.0 image with the HanLP analysis plugin. + +The HanLP plugin source used by this Dockerfile currently declares Elasticsearch +8.7.0 as its runtime dependency, so the image is intentionally pinned to 8.7.0. +Changing `ELASTIC_VERSION` requires a matching HanLP plugin branch or source +revision. + +## Build + +From the project root: + +```bash +docker compose build elasticsearch-hanlp +``` + +## Run with Docker Compose + +From the project root: + +```bash +docker compose up -d elasticsearch-hanlp +``` + +## HanLP Dictionary Maintenance + +This image keeps HanLP files in the same paths used by the plugin: + +| Purpose | Source path in this project | Path inside image/container | +| --- | --- | --- | +| HanLP main config | `config/hanlp.properties` | `/usr/share/elasticsearch/config/analysis-hanlp/hanlp.properties` | +| Remote dictionary config | `config/hanlp-remote.xml` | `/usr/share/elasticsearch/config/analysis-hanlp/hanlp-remote.xml` | +| Plugin fallback config | `config/hanlp.properties` | `/usr/share/elasticsearch/plugins/analysis-hanlp/config/hanlp.properties` | +| Business custom dictionary | `dictionaries/custom/zqyy-custom.txt` | `/usr/share/elasticsearch/plugins/analysis-hanlp/data/dictionary/custom/zqyy-custom.txt` | +| Built-in HanLP dictionaries | copied from plugin source `src/main/resources/data` | `/usr/share/elasticsearch/plugins/analysis-hanlp/data` | + +The plugin reads `hanlp.properties` from: + +```text +/usr/share/elasticsearch/config/analysis-hanlp/hanlp.properties +``` + +The `root` value in `hanlp.properties` is: + +```properties +root=plugins/analysis-hanlp/ +``` + +So dictionary paths are resolved under: + +```text +/usr/share/elasticsearch/plugins/analysis-hanlp/ +``` + +### 
Included Dictionary Files + +The Dockerfile copies the common HanLP data shipped with the plugin source into +the image. Important paths include: + +```text +data/dictionary/CoreNatureDictionary.txt +data/dictionary/CoreNatureDictionary.ngram.txt +data/dictionary/stopwords.txt +data/dictionary/synonym/CoreSynonym.txt +data/dictionary/custom/CustomDictionary.txt +data/dictionary/custom/ModernChineseSupplementaryWord.txt +data/dictionary/custom/ChinesePlaceName.txt +data/dictionary/custom/PersonalName.txt +data/dictionary/custom/OrganizationName.txt +data/dictionary/custom/ShanghaiPlaceName.txt +data/dictionary/person/nr.txt +data/dictionary/person/nrf.txt +data/dictionary/place/ns.txt +data/dictionary/organization/nt.txt +data/dictionary/tc/ +``` + +This project also adds: + +```text +data/dictionary/custom/zqyy-custom.txt +``` + +and registers it in `CustomDictionaryPath`: + +```properties +CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; ModernChineseSupplementaryWord.txt; ChinesePlaceName.txt ns; PersonalName.txt; OrganizationName.txt; ShanghaiPlaceName.txt ns; zqyy-custom.txt nz; data/dictionary/person/nrf.txt nrf; +``` + +### Add Or Change Business Words + +Edit: + +```text +dictionaries/custom/zqyy-custom.txt +``` + +Format: + +```text +word nature frequency +``` + +Examples: + +```text +zqyy nz 1000 +某业务系统 nz 1000 +张三 nr 1000 +上海浦东 ns 1000 +某某科技有限公司 nt 1000 +``` + +Common natures: + +| Nature | Meaning | +| --- | --- | +| `nz` | other proper noun, common for products and domain terms | +| `ns` | place name | +| `nr` | person name | +| `nt` | organization name | +| `n` | noun | +| `v` | verb | + +After changing local files, rebuild and restart: + +```bash +docker compose build --no-cache elasticsearch-hanlp +docker compose up -d elasticsearch-hanlp +``` + +For local development only, you can bind-mount the dictionary or config files. +See the commented `volumes` lines in the root `docker-compose.yml`. 
The plugin +supports custom dictionary hot reload, but production images should be rebuilt +so every node has identical dictionary files. + +### Remote Dictionaries + +Remote dictionaries are configured in: + +```text +config/hanlp-remote.xml +``` + +Example: + +```xml +<entry key="remote_ext_dict">http://example.com/hanlp/ext.dic nz</entry> +<entry key="remote_ext_stopwords">http://example.com/hanlp/stopwords.txt</entry> +``` + +If remote dictionaries are not needed, keep the entries commented out. + +## Quick Test + +```bash +curl "http://localhost:9200/_cat/plugins?v" + +curl -X POST "http://localhost:9200/_analyze?pretty" \ + -H "Content-Type: application/json" \ + -d '{ + "tokenizer": "hanlp", + "text": "中华人民共和国国歌" + }' +``` + +## Index Example + +```bash +curl -X PUT "http://localhost:9200/hanlp-demo" \ + -H "Content-Type: application/json" \ + -d '{ + "settings": { + "analysis": { + "analyzer": { + "hanlp_analyzer": { + "type": "custom", + "tokenizer": "hanlp" + } + } + } + }, + "mappings": { + "properties": { + "title": { + "type": "text", + "analyzer": "hanlp_analyzer", + "search_analyzer": "hanlp_analyzer" + } + } + } + }' + +curl -X POST "http://localhost:9200/hanlp-demo/_doc/1?refresh=true" \ + -H "Content-Type: application/json" \ + -d '{"title":"中华人民共和国国歌"}' + +curl -X GET "http://localhost:9200/hanlp-demo/_search?pretty" \ + -H "Content-Type: application/json" \ + -d '{ + "query": { + "match": { + "title": "国歌" + } + } + }' +``` diff --git a/src/zqyy/elasticsearch/config/hanlp-remote.xml b/src/zqyy/elasticsearch/config/hanlp-remote.xml new file mode 100644 index 0000000..b97ac4a --- /dev/null +++ b/src/zqyy/elasticsearch/config/hanlp-remote.xml @@ -0,0 +1,11 @@ + + + + HanLP Analyzer remote dictionary configuration + + + + + + + diff --git a/src/zqyy/elasticsearch/config/hanlp.properties b/src/zqyy/elasticsearch/config/hanlp.properties new file mode 100644 index 0000000..2391b6f --- /dev/null +++ b/src/zqyy/elasticsearch/config/hanlp.properties @@ -0,0 +1,47 @@ +# Root path of HanLP data, relative to ES_HOME. 
+root=plugins/analysis-hanlp/ + +# Core dictionary path +CoreDictionaryPath=data/dictionary/CoreNatureDictionary.txt + +# BiGram dictionary path +BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt + +# Core stop word dictionary path +CoreStopWordDictionaryPath=data/dictionary/stopwords.txt + +# Core synonym dictionary path +CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt + +# Person name dictionary path +PersonDictionaryPath=data/dictionary/person/nr.txt + +# Person name dictionary tr path +PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt + +# Traditional/Simplified Chinese conversion dictionary root path +tcDictionaryRoot=data/dictionary/tc + +# Custom dictionary path. +# Paths after the first custom path may be relative to data/dictionary/custom. +# Format: word [nature] [frequency], one word per line in the dictionary file. +CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; ModernChineseSupplementaryWord.txt; ChinesePlaceName.txt ns; PersonalName.txt; OrganizationName.txt; ShanghaiPlaceName.txt ns; zqyy-custom.txt nz; data/dictionary/person/nrf.txt nrf; + +# HMM segment model path +#HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin + +# Show term nature +#ShowTermNature=true + +# IO Adapter +##IOAdapter=com.hankcs.hanlp.corpus.io.FileIOAdapter + +# Perceptron model paths. These require the corresponding model files to exist. +PerceptronCWSModelPath=data/model/perceptron/pku1998/cws.bin +PerceptronPOSModelPath=data/model/perceptron/pku1998/pos.bin +PerceptronNERModelPath=data/model/perceptron/pku1998/ner.bin + +# CRF model paths. These require the corresponding model files to exist. 
+CRFCWSModelPath=data/model/crf/pku199801/cws.txt.bin +CRFPOSModelPath=data/model/crf/pku199801/pos.txt.bin +CRFNERModelPath=data/model/crf/pku199801/ner.txt.bin diff --git a/src/zqyy/elasticsearch/dictionaries/custom/zqyy-custom.txt b/src/zqyy/elasticsearch/dictionaries/custom/zqyy-custom.txt new file mode 100644 index 0000000..e489f24 --- /dev/null +++ b/src/zqyy/elasticsearch/dictionaries/custom/zqyy-custom.txt @@ -0,0 +1,4 @@ +zqyy nz 1000 +HanLP nz 1000 +Elasticsearch nz 1000 +自定义业务词条 nz 1000 diff --git a/src/zqyy/elasticsearch/test-hanlp.sh b/src/zqyy/elasticsearch/test-hanlp.sh new file mode 100644 index 0000000..86aca68 --- /dev/null +++ b/src/zqyy/elasticsearch/test-hanlp.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env sh +set -eu + +ES_URL="${ES_URL:-http://localhost:9200}" + +echo "Checking Elasticsearch at ${ES_URL} ..." +curl -fsS "${ES_URL}" >/dev/null + +echo "Checking analysis-hanlp plugin ..." +PLUGINS="$(curl -fsS "${ES_URL}/_cat/plugins?h=component")" +echo "${PLUGINS}" + +if ! echo "${PLUGINS}" | grep -Eq "^(analysis-hanlp|hanlp plugin)$"; then + echo "HanLP plugin is not loaded." >&2 + echo "Loaded plugins:" >&2 + curl -fsS "${ES_URL}/_cat/plugins?v" >&2 || true + exit 1 +fi + +echo "Testing HanLP analyzer ..." +ANALYZE_RESULT="$( + curl -fsS -X POST "${ES_URL}/_analyze" \ + -H "Content-Type: application/json" \ + -d '{"tokenizer":"hanlp","text":"中华人民共和国国歌"}' +)" + +echo "${ANALYZE_RESULT}" + +if echo "${ANALYZE_RESULT}" | grep -q '"tokens"'; then + echo "HanLP analyzer is available." +else + echo "HanLP analyzer test failed." >&2 + exit 1 +fi + +echo "Testing zqyy custom dictionary ..." +CUSTOM_RESULT="$( + curl -fsS -X POST "${ES_URL}/_analyze" \ + -H "Content-Type: application/json" \ + -d '{"tokenizer":"hanlp","text":"这是自定义业务词条测试"}' +)" + +echo "${CUSTOM_RESULT}" + +if echo "${CUSTOM_RESULT}" | grep -q '"token":"自定义业务词条"'; then + echo "zqyy custom dictionary is available." +else + echo "zqyy custom dictionary token was not found in analyze result." 
>&2 + exit 1 +fi