# Build-time arguments for the CUDA base images and the GPU compute level
ARG CUDA_VERSION="12.6.2"
ARG OS="ubuntu24.04"
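# CUDA compute capability without the dot; 86 targets consumer Ampere GPUs such as the RTX 30xx series.
# Override at build time with --build-arg COMPUTE_LEVEL=<level> to match your GPU.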
ARG COMPUTE_LEVEL="86"
ARG CUDA_BUILDER_IMAGE="${CUDA_VERSION}-devel-${OS}"
ARG CUDA_RUNTIME_IMAGE="${CUDA_VERSION}-runtime-${OS}"

# Builder stage: compile llama.cpp with CUDA support
FROM nvidia/cuda:${CUDA_BUILDER_IMAGE} AS builder

# Set the compute level as an environment variable to be used later
ARG COMPUTE_LEVEL
ENV COMPUTE_LEVEL=${COMPUTE_LEVEL}

# Install build dependencies
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        libpoppler-cpp-dev \
        pkg-config \
        git \
        poppler-utils \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /build

# Clone and build llama.cpp, then prune everything the runtime stage does not need
RUN git clone https://github.com/ggerganov/llama.cpp && \
    cd llama.cpp && \
    # Echo the value to debug the variable substitution
    echo "Using compute level: compute_${COMPUTE_LEVEL}" && \
    CUDA_DOCKER_ARCH="compute_${COMPUTE_LEVEL}" make GGML_CUDA=1 -j 8 && \
    # Remove build artifacts and sources, keeping only the llama-server binary
    find . -maxdepth 1 \( -name "llama-*" -o -name "ggml" -o -name "examples" -o -name "models" \) ! -name "llama-server" -exec rm -rf {} +

# Runtime stage: set up the runtime environment
FROM nvidia/cuda:${CUDA_RUNTIME_IMAGE} AS runtime

# Install runtime dependencies: Python, plus ocrmypdf and German Tesseract data for PDF OCR
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        python3-pip \
        python-is-python3 \
        ocrmypdf \
        tesseract-ocr-deu \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Set the working directory
WORKDIR /build
# Copy the built artifacts from the builder stage
COPY --from=builder /build/llama.cpp .

# Set the working directory for the application
WORKDIR /app
# Copy the requirements and install Python dependencies
COPY requirements.txt .
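# Note: --break-system-packages is required because Ubuntu 24.04 marks its system Python
# as externally managed (PEP 668), so pip would otherwise refuse a system-wide install.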
RUN pip install --no-cache-dir --break-system-packages -r requirements.txt
# Copy the rest of the application code
COPY . .

# Expose the application port
EXPOSE 5000
# Command to run the application
CMD ["python", "app.py", "--server_path", "/build/llama-server", "--model_path", "/models"]
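
# Example usage (a sketch; the image tag "llama-pdf-app" and the ./models host directory are
# illustrative, and running with --gpus requires the NVIDIA Container Toolkit on the host):
#
#   docker build --build-arg COMPUTE_LEVEL=86 -t llama-pdf-app .
#   docker run --gpus all -p 5000:5000 -v "$(pwd)/models:/models" llama-pdf-app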