diff --git a/docker/Dockerfile b/docker/Dockerfile index 0ac6e31..1cbcaf6 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -73,13 +73,19 @@ COPY scripts/comfyui /etc/init.d/comfyui RUN chmod +x /etc/init.d/comfyui && \ update-rc.d comfyui defaults +COPY ./scripts/mgpu /usr/local/bin/mgpu +RUN chmod +x /usr/local/bin/mgpu + +COPY ./scripts/setup_symlinks.sh /usr/local/bin/setup_symlinks.sh +RUN chmod +x /usr/local/bin/setup_symlinks.sh + # Copy startup script COPY scripts/start.sh /scripts/start.sh RUN chmod +x /scripts/start.sh COPY scripts ${ROOT}/scripts -RUN rm -rf ${ROOT}/scripts/start.sh && rm -rf ${ROOT}/scripts/comfyui +RUN rm -rf ${ROOT}/scripts/start.sh && rm -rf ${ROOT}/scripts/comfyui && rm -rf ${ROOT}/scripts/mgpu && rm -rf ${ROOT}/scripts/setup_symlinks.sh # RUN usermod -aG crontab ubuntu diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 310d47d..f80b7f2 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -15,6 +15,8 @@ x-base-service: &base-service dockerfile: Dockerfile platforms: - linux/amd64 + environment: + - PATH=/usr/local/bin:$PATH # Ensure /usr/local/bin is in the PATH restart: unless-stopped platform: linux/amd64 # deploy: diff --git a/docker/scripts/comfyui b/docker/scripts/comfyui index 2422610..8d22b1f 100644 --- a/docker/scripts/comfyui +++ b/docker/scripts/comfyui @@ -10,95 +10,79 @@ # Paths ROOT="${ROOT:-/workspace}" -COMFYUI_HOME="$ROOT/.comfyui" -LOG_DIR="$COMFYUI_HOME/log" -DEBUG_LOG="$LOG_DIR/debug.log" -OUTPUT_LOG="$LOG_DIR/output.log" - -# ComfyUI specific paths -COMFY_DIR="${COMFY_DIR:-${ROOT}/ComfyUI}" -WORK_DIR="$COMFY_DIR" -SCRIPT_PATH="$COMFY_DIR/main.py" -PID_FILE="$COMFYUI_HOME/comfyui.pid" - -# Ensure log directory exists with correct permissions -prepare_logs() { - mkdir -p "$COMFYUI_HOME/log" - chmod 755 "$COMFYUI_HOME" - chmod 755 "$COMFYUI_HOME/log" -} +NUM_GPUS="${NUM_GPUS:-1}" log() { + local GPU_ID=$1 + shift + LOG_DIR="${ROOT}/comfyui_gpu${GPU_ID}/logs" + DEBUG_LOG="${LOG_DIR}/debug.log" echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$DEBUG_LOG" } +prepare_logs() { + local GPU_ID=$1 + LOG_DIR="${ROOT}/comfyui_gpu${GPU_ID}/logs" + mkdir -p "$LOG_DIR" + chmod 755 "$LOG_DIR" +} + start() { + local GPU_ID=$1 + + # Validate GPU ID + if ! [[ "$GPU_ID" =~ ^[0-9]+$ ]] || [ "$GPU_ID" -ge "$NUM_GPUS" ]; then + echo "Error: Invalid GPU ID $GPU_ID. Must be between 0 and $((NUM_GPUS-1))" + return 1 + fi + + # Set GPU-specific paths + local GPU_DIR="${ROOT}/comfyui_gpu${GPU_ID}" + local SCRIPT_PATH="${GPU_DIR}/main.py" + local PID_FILE="${GPU_DIR}/comfyui.pid" + local OUTPUT_LOG="${GPU_DIR}/logs/output.log" + # Prepare log directory - prepare_logs + prepare_logs "$GPU_ID" # Extensive logging - log "=== ComfyUI Service Start Diagnostics ===" - log "Current User: $(whoami)" - log "Current Working Directory: $(pwd)" - log "Script Path: $SCRIPT_PATH" - log "Working Directory: $WORK_DIR" - log "Log Directory: $LOG_DIR" - log "Debug Log: $DEBUG_LOG" - log "Output Log: $OUTPUT_LOG" + log "$GPU_ID" "=== ComfyUI Service Start Diagnostics ===" + log "$GPU_ID" "Current User: $(whoami)" + log "$GPU_ID" "GPU ID: $GPU_ID" + log "$GPU_ID" "Script Path: $SCRIPT_PATH" + log "$GPU_ID" "Working Directory: $GPU_DIR" + log "$GPU_ID" "Log Directory: ${GPU_DIR}/logs" # Validate paths - if [ ! -d "$WORK_DIR" ]; then - log "ERROR: Working directory does not exist: $WORK_DIR" + if [ ! -d "$GPU_DIR" ]; then + log "$GPU_ID" "ERROR: Working directory does not exist: $GPU_DIR" return 1 fi if [ ! 
-f "$SCRIPT_PATH" ]; then - log "ERROR: ComfyUI main script not found at $SCRIPT_PATH" + log "$GPU_ID" "ERROR: ComfyUI main script not found at $SCRIPT_PATH" return 1 fi # Verify Python environment ACTIVE_PYTHON=$(which python) - log "Active Python Path: $ACTIVE_PYTHON" + log "$GPU_ID" "Active Python Path: $ACTIVE_PYTHON" python --version - # Check for CUDA availability - CUDA_CHECK=$(python -c " -import torch -try: - cuda_available = torch.cuda.is_available() - device_count = torch.cuda.device_count() - print(f'CUDA_AVAILABLE:{cuda_available}') - print(f'CUDA_DEVICE_COUNT:{device_count}') -except Exception as e: - print(f'CUDA_ERROR:{str(e)}') -") - - # Parse CUDA check results - CUDA_AVAILABLE=$(echo "$CUDA_CHECK" | grep "CUDA_AVAILABLE:" | cut -d: -f2) - CUDA_DEVICE_COUNT=$(echo "$CUDA_CHECK" | grep "CUDA_DEVICE_COUNT:" | cut -d: -f2) - CUDA_ERROR=$(echo "$CUDA_CHECK" | grep "CUDA_ERROR:" | cut -d: -f2) - - # Log CUDA detection results - log "CUDA Check Results:" - log "CUDA Available: $CUDA_AVAILABLE" - log "CUDA Device Count: $CUDA_DEVICE_COUNT" - + # Set GPU-specific environment variables + export CUDA_VISIBLE_DEVICES=$GPU_ID + PORT="${PORT:-$((8188 + GPU_ID))}" + # Prepare run command - PORT="${PORT:-8188}" - RUN_COMMAND=(python "$SCRIPT_PATH" --listen 127.0.0.1 --port "$PORT") + RUN_COMMAND=(python "$SCRIPT_PATH" --listen 0.0.0.0 --port "$PORT") - # If no CUDA, force CPU mode - if [ "$CUDA_AVAILABLE" != "True" ] || [ "$CUDA_DEVICE_COUNT" = "0" ]; then - log "No CUDA devices found. Forcing CPU mode." - RUN_COMMAND+=("--cpu") - export CUDA_VISIBLE_DEVICES="" - fi - # Log the exact command being run - log "Executing ComfyUI start command: ${RUN_COMMAND[@]}" + log "$GPU_ID" "Executing ComfyUI start command: ${RUN_COMMAND[@]}" + log "$GPU_ID" "Using GPU: $GPU_ID (CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES)" + log "$GPU_ID" "Port: $PORT" - # Start ComfyUI as ubuntu user + # Start ComfyUI + cd "$GPU_DIR" PYTHONUNBUFFERED=1 \ "${RUN_COMMAND[@]}" >> "$OUTPUT_LOG" 2>&1 & local pid=$! @@ -109,63 +93,91 @@ except Exception as e: # Check if process is running if kill -0 "$pid" 2>/dev/null; then echo "$pid" > "$PID_FILE" - log "ComfyUI started with PID $pid. Full command: ${RUN_COMMAND[@]}" + log "$GPU_ID" "ComfyUI started with PID $pid. Full command: ${RUN_COMMAND[@]}" return 0 else - log "Failed to start ComfyUI. Command used: ${RUN_COMMAND[@]}" + log "$GPU_ID" "Failed to start ComfyUI. Command used: ${RUN_COMMAND[@]}" return 1 fi } stop() { - log "Stopping ComfyUI service..." + local GPU_ID=$1 + local GPU_DIR="${ROOT}/comfyui_gpu${GPU_ID}" + local PID_FILE="${GPU_DIR}/comfyui.pid" + + log "$GPU_ID" "Stopping ComfyUI service for GPU $GPU_ID..." 
if [ -f "$PID_FILE" ]; then PID=$(cat "$PID_FILE") if kill -0 "$PID" 2>/dev/null; then kill "$PID" - log "Sent termination signal to ComfyUI (PID: $PID)" + log "$GPU_ID" "Sent termination signal to ComfyUI (PID: $PID)" else - log "No running ComfyUI process found" + log "$GPU_ID" "No running ComfyUI process found" fi rm -f "$PID_FILE" else - log "No PID file found" + log "$GPU_ID" "No PID file found" fi } status() { + local GPU_ID=$1 + local GPU_DIR="${ROOT}/comfyui_gpu${GPU_ID}" + local PID_FILE="${GPU_DIR}/comfyui.pid" + if [ -f "$PID_FILE" ]; then PID=$(cat "$PID_FILE") if kill -0 "$PID" 2>/dev/null; then - echo "ComfyUI is running (PID: $PID)" + echo "ComfyUI is running on GPU $GPU_ID (PID: $PID)" return 0 else - echo "ComfyUI is not running (stale PID file)" + echo "ComfyUI is not running on GPU $GPU_ID (stale PID file)" return 1 fi else - echo "ComfyUI is not running" + echo "ComfyUI is not running on GPU $GPU_ID" return 1 fi } case "$1" in start) - start + if [ -z "$2" ]; then + echo "Error: GPU ID required" + echo "Usage: $0 start " + exit 1 + fi + start "$2" ;; stop) - stop + if [ -z "$2" ]; then + echo "Error: GPU ID required" + echo "Usage: $0 stop " + exit 1 + fi + stop "$2" ;; status) - status + if [ -z "$2" ]; then + echo "Error: GPU ID required" + echo "Usage: $0 status " + exit 1 + fi + status "$2" ;; restart) - stop - start + if [ -z "$2" ]; then + echo "Error: GPU ID required" + echo "Usage: $0 restart " + exit 1 + fi + stop "$2" + start "$2" ;; *) - echo "Usage: $0 {start|stop|status|restart}" + echo "Usage: $0 {start|stop|status|restart} " exit 1 esac diff --git a/docker/scripts/download_model.py b/docker/scripts/download_model.py deleted file mode 100755 index 1cb3112..0000000 --- a/docker/scripts/download_model.py +++ /dev/null @@ -1,109 +0,0 @@ -#!/usr/bin/env python3 -import os -import sys -import logging -from typing import Optional -from urllib.parse import urlparse -import subprocess - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s', - handlers=[ - logging.StreamHandler(sys.stdout), - logging.FileHandler('/tmp/model_download.log') - ] -) -logger = logging.getLogger(__name__) - -def try_huggingface_download(url: str, output_path: str, filename: str) -> bool: - """ - Attempt to download using Hugging Face Hub API. - Returns True if successful, False otherwise. 
- """ - try: - from huggingface_hub import hf_hub_download - from huggingface_hub.utils import enable_progress_bars - - enable_progress_bars() - - # Parse HF URL to get repo_id and filename - parsed = urlparse(url) - path_parts = [p for p in parsed.path.split('/') if p] - - if len(path_parts) < 4: # Need at least org/repo/resolve/filename - return False - - repo_id = f"{path_parts[0]}/{path_parts[1]}" - revision = path_parts[3] if path_parts[2] in ['resolve', 'raw'] else None - hf_filename = path_parts[-1] - - logger.info(f"Attempting HF download: repo={repo_id}, file={hf_filename}, rev={revision}") - - # Download using hf_hub_download - downloaded_path = hf_hub_download( - repo_id=repo_id, - filename=hf_filename, - revision=revision, - local_dir=output_path - ) - - # Move to final location if needed - final_path = os.path.join(output_path, filename) - if downloaded_path != final_path: - os.rename(downloaded_path, final_path) - - return os.path.exists(final_path) - - except Exception as e: - logger.error(f"HF download failed: {str(e)}") - return False - -def download_with_wget(url: str, output_path: str, filename: str) -> bool: - """ - Download file using wget. - Returns True if successful, False otherwise. - """ - try: - output_file = os.path.join(output_path, filename) - command = ['wget', '--progress=bar:force:noscroll', '-O', output_file, url] - - logger.info(f"Downloading with wget: {url}") - result = subprocess.run(command, capture_output=True, text=True) - - return result.returncode == 0 and os.path.exists(output_file) - - except Exception as e: - logger.error(f"wget download failed: {str(e)}") - return False - -def main(): - if len(sys.argv) != 4: - logger.error("Usage: download_model.py ") - return 1 - - url = sys.argv[1] - output_path = sys.argv[2] - filename = sys.argv[3] - - # Create output directory if it doesn't exist - os.makedirs(output_path, exist_ok=True) - - # Try Hugging Face download first if it's a HF URL - if 'huggingface.co' in url: - if try_huggingface_download(url, output_path, filename): - logger.info("Successfully downloaded using Hugging Face") - return 0 - logger.warning("Hugging Face download failed, falling back to wget") - - # Fall back to wget - if download_with_wget(url, output_path, filename): - logger.info("Successfully downloaded using wget") - return 0 - - logger.error("All download attempts failed") - return 1 - -if __name__ == "__main__": - sys.exit(main()) diff --git a/docker/scripts/mgpu b/docker/scripts/mgpu new file mode 100644 index 0000000..36d24a9 --- /dev/null +++ b/docker/scripts/mgpu @@ -0,0 +1,219 @@ +#!/bin/bash + +# Base directory for your ComfyUI instances +ROOT="${ROOT:-/workspace}" +NUM_GPUS="${NUM_GPUS:-1}" # Use the environment variable set during startup + +# Validate GPU ID +validate_gpu_id() { + local gpu_id=$1 + + if ! [[ "$gpu_id" =~ ^[0-9]+$ ]]; then + echo "Error: GPU ID must be a number" + return 1 + fi + + if [ "$gpu_id" -ge "$NUM_GPUS" ]; then + echo "Error: GPU ID $gpu_id is not available. Only $NUM_GPUS GPU(s) configured." + return 1 + fi +} + +# Function to show logs for a specific GPU +show_logs() { + local gpu_id=$1 + LOG_DIR="$ROOT/comfyui_gpu${gpu_id}/logs" + DEBUG_LOG="$LOG_DIR/debug.log" + OUTPUT_LOG="$LOG_DIR/output.log" + + echo "=== Showing logs for GPU $gpu_id ===" + if [ -f "$DEBUG_LOG" ]; then + echo "=== Debug Log ===" + tail -n 50 "$DEBUG_LOG" + else + echo "No debug log found." 
+    fi
+
+    if [ -f "$OUTPUT_LOG" ]; then
+        echo "=== Output Log ==="
+        tail -n 50 "$OUTPUT_LOG"
+    else
+        echo "No output log found."
+    fi
+}
+
+# Function to show logs for all GPUs
+show_all_logs() {
+    for gpu in $(seq 0 $((NUM_GPUS-1))); do
+        show_logs "$gpu"
+        echo "----------------------------------------"
+    done
+}
+
+# Function to restart a specific GPU service
+restart_service() {
+    local gpu_id=$1
+    echo "Restarting service for GPU $gpu_id..."
+    service "comfyui_gpu${gpu_id}" restart
+    echo "Service for GPU $gpu_id restarted."
+}
+
+# Function to restart all GPU services
+restart_all_services() {
+    echo "Restarting all GPU services..."
+    for gpu in $(seq 0 $((NUM_GPUS-1))); do
+        restart_service "$gpu"
+    done
+    echo "All GPU services restarted."
+}
+
+# Function to check status of a specific GPU service
+check_status() {
+    local gpu_id=$1
+    echo "=== Status for GPU $gpu_id ==="
+    service "comfyui_gpu${gpu_id}" status
+}
+
+# Function to check status of all GPU services
+check_all_status() {
+    echo "Checking status for all GPUs..."
+    for gpu in $(seq 0 $((NUM_GPUS-1))); do
+        check_status "$gpu"
+        echo "----------------------------------------"
+    done
+}
+
+# Function to stop a specific GPU service
+stop_service() {
+    local gpu_id=$1
+    echo "Stopping service for GPU $gpu_id..."
+    service "comfyui_gpu${gpu_id}" stop
+    echo "Service for GPU $gpu_id stopped."
+}
+
+# Function to stop all GPU services
+stop_all_services() {
+    echo "Stopping all GPU services..."
+    for gpu in $(seq 0 $((NUM_GPUS-1))); do
+        stop_service "$gpu"
+    done
+    echo "All GPU services stopped."
+}
+
+# Function to start a specific GPU service
+start_service() {
+    local gpu_id=$1
+    echo "Starting service for GPU $gpu_id..."
+    service "comfyui_gpu${gpu_id}" start
+    echo "Service for GPU $gpu_id started."
+}
+
+# Function to start all GPU services
+start_all_services() {
+    echo "Starting all GPU services..."
+    for gpu in $(seq 0 $((NUM_GPUS-1))); do
+        start_service "$gpu"
+    done
+    echo "All GPU services started."
+}
+
+# Show usage information
+show_usage() {
+    echo "Usage: $0 COMMAND [GPU_ID]"
+    echo
+    echo "Commands:"
+    echo "  logs <gpu_id>     Show logs for specific GPU"
+    echo "  logs-all          Show logs for all GPUs"
+    echo "  restart <gpu_id>  Restart service for specific GPU"
+    echo "  restart-all       Restart all GPU services"
+    echo "  status <gpu_id>   Check status for specific GPU"
+    echo "  status-all        Check status of all GPU services"
+    echo "  stop <gpu_id>     Stop service for specific GPU"
+    echo "  stop-all          Stop all GPU services"
+    echo "  start <gpu_id>    Start service for specific GPU"
+    echo "  start-all         Start all GPU services"
+    echo "  count             Show number of configured GPUs"
+    echo
+    echo "GPU_ID should be between 0 and $((NUM_GPUS-1))"
+}
+
+# Main command handling
+case "$1" in
+    logs)
+        if [ -z "$2" ]; then
+            echo "Error: GPU ID required"
+            show_usage
+            exit 1
+        fi
+        if ! validate_gpu_id "$2"; then
+            exit 1
+        fi
+        show_logs "$2"
+        ;;
+    logs-all)
+        show_all_logs
+        ;;
+    restart)
+        if [ -z "$2" ]; then
+            echo "Error: GPU ID required"
+            show_usage
+            exit 1
+        fi
+        if ! validate_gpu_id "$2"; then
+            exit 1
+        fi
+        restart_service "$2"
+        ;;
+    restart-all)
+        restart_all_services
+        ;;
+    status)
+        if [ -z "$2" ]; then
+            echo "Error: GPU ID required"
+            show_usage
+            exit 1
+        fi
+        if ! validate_gpu_id "$2"; then
+            exit 1
+        fi
+        check_status "$2"
+        ;;
+    status-all)
+        check_all_status
+        ;;
+    stop)
+        if [ -z "$2" ]; then
+            echo "Error: GPU ID required"
+            show_usage
+            exit 1
+        fi
+        if ! validate_gpu_id "$2"; then
validate_gpu_id "$2"; then + exit 1 + fi + stop_service "$2" + ;; + stop-all) + stop_all_services + ;; + start) + if [ -z "$2" ]; then + echo "Error: GPU ID required" + show_usage + exit 1 + fi + if ! validate_gpu_id "$2"; then + exit 1 + fi + start_service "$2" + ;; + start-all) + start_all_services + ;; + count) + echo "Configured GPUs: $NUM_GPUS" + ;; + *) + show_usage + exit 1 + ;; +esac \ No newline at end of file diff --git a/docker/scripts/start.sh b/docker/scripts/start.sh index 74c7509..eed48cb 100644 --- a/docker/scripts/start.sh +++ b/docker/scripts/start.sh @@ -7,14 +7,13 @@ CONFIG_DIR="${CONFIG_DIR:-${ROOT}/config}" COMFY_DIR="${COMFY_DIR:-${ROOT}/ComfyUI}" CONFIG_FILE="$CONFIG_DIR/config.json" -# Setup logging -LOG_FILE="${ROOT}/start.log" -mkdir -p "$(dirname "$LOG_FILE")" -touch "$LOG_FILE" - -log() { - local msg="[$(date +'%Y-%m-%d %H:%M:%S')] $*" - echo "$msg" | tee -a "$LOG_FILE" +# Setup logging for each GPU +setup_logging() { + for gpu in $(seq 0 1); do # Adjust range based on the number of GPUs + LOG_DIR="${ROOT}/comfyui_gpu${gpu}/logs" + mkdir -p "$LOG_DIR" + touch "$LOG_DIR/debug.log" "$LOG_DIR/output.log" + done } check_env_vars() { @@ -117,20 +116,53 @@ setup_comfyui() { if [ -z "$COMFY_COMMIT" ] || [ "$COMFY_COMMIT" = "null" ]; then log "WARNING: No comfy_version found, defaulting to 'main'" COMFY_COMMIT="main" - else - log "Switching ComfyUI to commit: $COMFY_COMMIT" - cd "$COMFY_DIR" - git reset --hard "$COMFY_COMMIT" fi + # Update base ComfyUI to correct commit + cd "$COMFY_DIR" + git reset --hard "$COMFY_COMMIT" + + # Get number of GPUs, allowing override with FORCE_NUM_GPUS + if [ -n "$FORCE_NUM_GPUS" ]; then + GPU_COUNT=$FORCE_NUM_GPUS + log "Using forced GPU count: $GPU_COUNT" + else + GPU_COUNT=$(python -c "import torch; print(torch.cuda.device_count())") + log "Detected $GPU_COUNT GPUs" + fi + + export NUM_GPUS=$GPU_COUNT + + # Create shared directories + mkdir -p "${ROOT}/shared/models" + mkdir -p "${ROOT}/shared/custom_nodes" + if [ -d "/workspace/shared_custom_nodes" ]; then - log "Copying and overriding shared custom nodes" - cp -rf /workspace/shared_custom_nodes/* "$COMFY_DIR/custom_nodes/" + log "Copying shared custom nodes to shared directory" + cp -rf /workspace/shared_custom_nodes/* "${ROOT}/shared/custom_nodes/" fi - # Get number of available GPUs - GPU_COUNT=$(python -c "import torch; print(torch.cuda.device_count())") - log "Found $GPU_COUNT GPUs" + # Copy ComfyUI for each GPU + for gpu in $(seq 0 $((GPU_COUNT-1))); do + GPU_DIR="${ROOT}/comfyui_gpu${gpu}" + + log "Setting up ComfyUI for GPU $gpu..." + + # Copy ComfyUI if it doesn't exist + if [ ! -d "$GPU_DIR" ]; then + log "Copying ComfyUI for GPU $gpu..." + cp -r "$COMFY_DIR" "$GPU_DIR" + + # Remove default models and custom_nodes directories + rm -rf "$GPU_DIR/models" + rm -rf "$GPU_DIR/custom_nodes" + fi + + # Create symlinks for shared resources + log "Creating symlinks for GPU $gpu..." + ln -sfn "${ROOT}/shared/models" "$GPU_DIR/models" + ln -sfn "${ROOT}/shared/custom_nodes" "$GPU_DIR/custom_nodes" + done if [ "$GPU_COUNT" -eq 0 ]; then log "No GPUs found - setting up CPU instance" @@ -172,34 +204,17 @@ export PORT=${port}" "$service_file" } start_comfyui() { - # Get number of available GPUs - GPU_COUNT=$(python -c "import torch; print(torch.cuda.device_count())") + log "Starting ComfyUI services..." 
- if [ "$GPU_COUNT" -eq 0 ]; then - log "Starting CPU instance" - service "comfyui_cpu" start - log "Started ComfyUI CPU service on port 8188" - else - # Start each GPU service - for gpu in $(seq 0 $((GPU_COUNT-1))); do - port=$((8188 + gpu)) - service_name="comfyui_gpu${gpu}" - - service "$service_name" start - log "Started ComfyUI service for GPU $gpu on port $port" - done - fi -} -# Function to download models - -normalize_path() { - local path="$1" - # Only normalize if it's not a URL (doesn't start with http:// or https://) - if [[ "$path" != http://* ]] && [[ "$path" != https://* ]]; then - echo "${path//\/\//\/}" - else - echo "$path" + # Start all GPU instances using mgpu + mgpu start-all + + if [ $? -ne 0 ]; then + log "Failed to start ComfyUI services" + return 1 fi + + log "All ComfyUI services started successfully" } download_model() { @@ -226,38 +241,26 @@ download_model() { log "Using download path: $primary_path" - local target_dir="$(normalize_path "$COMFY_DIR/$primary_path")" + local target_dir="$(normalize_path "${ROOT}/shared/$primary_path")" local target_file="$(normalize_path "$target_dir/$filename")" mkdir -p "$target_dir" - # First, try using Python script for Hugging Face download - if [[ "$url" == *"huggingface.co"* ]]; then - python "${ROOT}/scripts/download_model.py" \ - "$url" "$target_dir" "$filename" >> "$LOG_FILE" 2>&1 - - # Check download result - if [ $? -eq 0 ] && [ -s "$target_file" ]; then - log "Successfully downloaded $url using Python script" - return 0 - fi - fi + # Download using wget + log "Downloading from: $url" + log "Saving to: $target_file" - # If Python script fails or not a HF URL, try wget wget \ --progress=bar:force:noscroll \ - --no-verbose \ - -q \ -O "$target_file" \ "$url" >> "$LOG_FILE" 2>&1 # Check wget result if [ $? -eq 0 ] && [ -s "$target_file" ]; then - log "Successfully downloaded $url using wget" + log "Successfully downloaded $url" return 0 fi - # If both methods fail log "Failed to download $url" return 1 } @@ -268,23 +271,6 @@ download_models() { return 0 fi - HF_HUB_ENABLE_HF_TRANSFER=1 - - # Try to import huggingface_hub - if python -c "import huggingface_hub" 2>/dev/null; then - # Login to Hugging Face using environment variable - if [ -n "$HF_TOKEN" ]; then - log "Logging into Hugging Face" - python -c "from huggingface_hub import login; login(token='$HF_TOKEN')" - fi - else - log "huggingface_hub not found. Falling back to wget downloads." - return 1 - fi - - local models_dir="/home/ubuntu/ComfyUI/models/checkpoints" - mkdir -p "$models_dir" - # Process each model in the configuration local models models=$(jq -c '.models[] | select(.url != null)' "$CONFIG_FILE") @@ -320,9 +306,10 @@ install_nodes() { if [ ! -f "$CONFIG_FILE" ]; then log "Config not found: $CONFIG_FILE" return - fi + } - cd "$COMFY_DIR/custom_nodes" + # Use shared custom_nodes directory + cd "${ROOT}/shared/custom_nodes" while IFS= read -r node; do name=$(echo "$node" | jq -r '.name') @@ -393,92 +380,39 @@ start_nginx() { service nginx start } -monitor_services() { - # Get number of GPUs - GPU_COUNT=$(python -c "import torch; print(torch.cuda.device_count())") - RESTART_ATTEMPTS=0 - MAX_RESTART_ATTEMPTS=5 - COMFYUI_DEBUG_LOG="$ROOT/.comfyui/log/debug.log" - - while true; do - if [ "$GPU_COUNT" -eq 0 ]; then - # Monitor CPU instance - if ! 
-                log "ComfyUI CPU service stopped unexpectedly"
-                log "=== ComfyUI Debug Log ==="
-                tail -n 50 "$COMFYUI_DEBUG_LOG"
-
-                if [ $RESTART_ATTEMPTS -lt $MAX_RESTART_ATTEMPTS ]; then
-                    log "Attempting to restart ComfyUI CPU service (Attempt $((RESTART_ATTEMPTS+1))/$MAX_RESTART_ATTEMPTS)..."
-                    service comfyui_cpu start
-                    RESTART_ATTEMPTS=$((RESTART_ATTEMPTS+1))
-                else
-                    log "MAX RESTART ATTEMPTS REACHED. Giving up on ComfyUI CPU service."
-                    break
-                fi
-            fi
-        else
-            # Monitor each GPU instance
-            for gpu in $(seq 0 $((GPU_COUNT-1))); do
-                service_name="comfyui_gpu${gpu}"
-                if ! service "$service_name" status >/dev/null 2>&1; then
-                    log "ComfyUI GPU $gpu service stopped unexpectedly"
-                    log "=== ComfyUI Debug Log for GPU $gpu ==="
-                    tail -n 50 "$COMFYUI_DEBUG_LOG"
-
-                    if [ $RESTART_ATTEMPTS -lt $MAX_RESTART_ATTEMPTS ]; then
-                        log "Attempting to restart ComfyUI GPU $gpu service (Attempt $((RESTART_ATTEMPTS+1))/$MAX_RESTART_ATTEMPTS)..."
-                        service "$service_name" start
-                        RESTART_ATTEMPTS=$((RESTART_ATTEMPTS+1))
-                    else
-                        log "MAX RESTART ATTEMPTS REACHED. Giving up on ComfyUI GPU $gpu service."
-                        break 2  # Break out of both loops
-                    fi
-                fi
-            done
-        fi
-
-        # Reset restart attempts if all services are running
-        RESTART_ATTEMPTS=0
-        sleep 30
-    done
-}
-
 main() {
-    log "Starting setup process..."
-
+    log "Starting initialization..."
+
+    # Setup logging
+    setup_logging
+
     # Check environment variables
-    check_env_vars
-
-    # SSH Key Setup
-    setup_ssh_access
-
-    # Initial setup
-    setup_nginx
-    setup_comfyui
-
-    # Process config and install components
-    if [ -f "$CONFIG_FILE" ]; then
-        install_nodes
-        download_models
-    else
-        log "No config found at: $CONFIG_FILE"
-        log "Continuing with default configuration..."
-    fi
-
-    # Setup and start services
-    setup_services
-    start_nginx
-
-    # Start ComfyUI service
-    log "Starting ComfyUI service..."
-    start_comfyui
-
-    # Monitor services
-    monitor_services
-
-    log "SLEEPING..."
-
+    check_env_vars || exit 1
+
+    # Setup SSH access
+    setup_ssh_access || exit 1
+
+    # Setup nginx
+    setup_nginx || exit 1
+
+    # Setup ComfyUI instances
+    setup_comfyui || exit 1
+
+    # Download models if specified in config
+    download_models || exit 1
+
+    # Install custom nodes if specified in config
+    install_nodes || exit 1
+
+    # Create the per-GPU init.d services that mgpu manages
+    setup_services || exit 1
+
+    # Start ComfyUI services
+    start_comfyui || exit 1
+
+    # Start nginx
+    start_nginx || exit 1
+
+    log "Initialization complete"
+
+    # Keep container running
     sleep infinity
 }