lib/clacky/server/server_master.rb



# frozen_string_literal: true

require "socket"
require "tmpdir"
require_relative "../banner"
require_relative "../version"

module Clacky
  module Server
    # Master process — owns the listen socket, spawns/monitors worker processes.
    #
    # Lifecycle:
    #   clacky server
    #     └─ Master.run  (this file)
    #           ├─ creates TCPServer, holds it forever
    #           ├─ spawns Worker via spawn() — full new Ruby process, loads fresh gem
    #           ├─ traps USR1 → hot_restart (spawn new worker, gracefully stop old)
    #           └─ traps TERM/INT → shutdown (stop worker, exit cleanly)
    #
    # Worker receives:
    #   CLACKY_WORKER=1          — "I am a worker, start HttpServer directly"
    #   CLACKY_INHERIT_FD=<n>   — file descriptor number of the inherited TCPServer socket
    #   CLACKY_MASTER_PID=<n>   — master PID so worker can send USR1 back on upgrade
    class Master
      # Worker exits with this code to request a hot restart (e.g. after gem upgrade).
      RESTART_EXIT_CODE        = 75
      MAX_CONSECUTIVE_FAILURES = 5

      # How long (seconds) to wait for a new worker to become ready before killing the old one.
      NEW_WORKER_BOOT_WAIT = 3

      # Store the listen address and worker options. Nothing is bound or
      # spawned here; all runtime state starts out empty.
      #
      # host        - address the listen socket is bound to
      # port        - preferred listen port
      # argv        - kept for backward compat but no longer used
      # extra_flags - extra CLI flags appended to the worker command,
      #               e.g. ["--brand-test"]
      def initialize(host:, port:, argv: nil, extra_flags: [])
        # Listen address and worker command-line pieces.
        @host, @port = host, port
        @argv        = argv
        @extra_flags = extra_flags

        # Runtime handles: the listen socket and the current worker's PID.
        @socket     = nil
        @worker_pid = nil

        # Flags set from the signal traps and consumed by the monitor loop.
        @restart_requested  = false
        @shutdown_requested = false
      end

      # Main entry point of the master process.
      #
      # Stops any previous master, binds the listen socket, prints the banner,
      # writes the PID file, installs signal traps, spawns the first worker and
      # then supervises it until shutdown is requested or the worker has failed
      # MAX_CONSECUTIVE_FAILURES times in a row. The PID file is removed on the
      # way out, including when the process leaves via exit.
      def run
        # 0. Kill any existing master on this port before binding.
        kill_existing_master

        # 1. Try to bind the socket.
        # If port is 7070 (default), try fallback ports 7071-7075 if occupied.
        # If port is non-default (user-specified), only try that exact port.
        original_port = @port
        max_port = (@port == 7070) ? (@port + 5) : @port
        @socket = bind_with_fallback(@host, @port, max_port: max_port)

        if @socket.nil?
          if @port == 7070
            Clacky::Logger.error("[Master] No available ports in range 7070-7075")
          else
            Clacky::Logger.error("[Master] Port #{@port} is in use")
          end
          exit(1)
        end

        @socket.setsockopt(Socket::SOL_SOCKET, Socket::SO_REUSEADDR, true)
        @port = @socket.local_address.ip_port  # Update to actual bound port

        # 2. Print banner after port is determined
        print_banner(port_changed: @port != original_port, original_port: original_port)

        write_pid_file

        # 3. Signal handlers — they only flip flags; the real work happens in
        #    the monitor loop below.
        Signal.trap("USR1") { @restart_requested  = true }
        Signal.trap("TERM") { @shutdown_requested = true }
        Signal.trap("INT")  { @shutdown_requested = true }
        Signal.trap("HUP")  { @shutdown_requested = true }

        # 4. Spawn first worker
        @worker_pid = spawn_worker
        @consecutive_failures = 0

        # 5. Monitor loop
        loop do
          if @shutdown_requested
            shutdown
            break
          end

          if @restart_requested
            @restart_requested = false
            hot_restart
            @consecutive_failures = 0
          end

          # Non-blocking wait: check if worker has exited
          pid, status = Process.waitpid2(@worker_pid, Process::WNOHANG)
          if pid
            exit_code = status.exitstatus
            if exit_code == RESTART_EXIT_CODE
              Clacky::Logger.info("[Master] Worker requested restart (exit #{RESTART_EXIT_CODE}).")
              @worker_pid = spawn_worker
              @consecutive_failures = 0
            elsif @shutdown_requested
              break
            else
              @consecutive_failures += 1
              if @consecutive_failures >= MAX_CONSECUTIVE_FAILURES
                Clacky::Logger.error("[Master] Worker failed #{MAX_CONSECUTIVE_FAILURES} times in a row, giving up.")
                shutdown
                break
              end
              delay = [0.5 * (2 ** (@consecutive_failures - 1)), 30].min  # exponential backoff, max 30s
              # exitstatus is nil when the worker was terminated by a signal;
              # report the signal number instead of logging an empty exit code.
              reason = status.signaled? ? "signal #{status.termsig}" : "exit #{exit_code}"
              Clacky::Logger.warn("[Master] Worker exited unexpectedly (#{reason}), failure #{@consecutive_failures}/#{MAX_CONSECUTIVE_FAILURES}, restarting in #{delay}s...")
              sleep delay
              @worker_pid = spawn_worker
            end
          end

          sleep 0.1
        end
      ensure
        remove_pid_file
      end


      # Spawn a fresh Ruby process that loads the (possibly updated) gem from disk.
      # The listen socket is inherited via its file descriptor number.
      def spawn_worker
        # The worker finds the listen socket by fd number, so the descriptor
        # must survive exec: clear its close-on-exec flag.
        @socket.close_on_exec = false

        master_pid = Process.pid
        worker_env = {
          "CLACKY_WORKER"      => "1",
          "CLACKY_INHERIT_FD"  => @socket.fileno.to_s,
          "CLACKY_MASTER_PID"  => master_pid.to_s
        }

        # ARGV cannot be reused (Thor has already consumed it), so the command
        # line is rebuilt by hand:
        #   clacky server --host HOST --port PORT [extra_flags]
        interpreter  = RbConfig.ruby
        entry_script = File.expand_path($0)
        cli_args     = ["server", "--host", @host.to_s, "--port", @port.to_s].concat(@extra_flags)

        Clacky::Logger.info("[Master PID=#{master_pid}] spawn: #{interpreter} #{entry_script} #{cli_args.join(' ')}")
        Clacky::Logger.info("[Master PID=#{master_pid}] env: #{worker_env.inspect}")

        # pgroup: 0 makes the worker the leader of a new process group, so the
        # master can later signal the whole group (-pid) and reach grandchildren
        # (e.g. the chrome-devtools-mcp node process) even when the worker is
        # force-killed before its own shutdown hook runs.
        #
        # Stdio is deliberately inherited from the master so the user sees the
        # worker's output in the same terminal; EPIPE protection for a broken
        # parent stdout lives in the worker itself (see cli.rb worker entry —
        # EPIPESafeIO wrapper).
        worker_pid = spawn(worker_env, interpreter, entry_script, *cli_args, pgroup: 0)
        Clacky::Logger.info("[Master PID=#{master_pid}] Spawned worker PID=#{worker_pid} pgroup=#{worker_pid}")
        worker_pid
      end

      # Spawn a new worker, wait for it to boot, then gracefully stop the old one.
      def hot_restart
        old_pid = @worker_pid
        Clacky::Logger.info("[Master] Hot restart: spawning new worker (old PID=#{old_pid})...")

        new_pid = spawn_worker
        @worker_pid = new_pid

        # Give the new worker time to bind and start serving
        sleep NEW_WORKER_BOOT_WAIT

        # Gracefully stop old worker — TERM the whole process group first so
        # grandchildren (node MCP, etc.) also get a chance to shut down cleanly.
        begin
          Process.kill("TERM", -old_pid)

          # Poll for the old worker's exit for up to 5s (non-blocking waits).
          reaped = false
          deadline = Time.now + 5
          loop do
            pid, = Process.waitpid2(old_pid, Process::WNOHANG)
            if pid
              reaped = true
              break
            end
            break if Time.now > deadline
            sleep 0.1
          end

          # Force-kill whatever is left in the group: the worker itself if it
          # ignored TERM, or grandchildren that outlived it. Best effort only.
          killed =
            begin
              Process.kill("KILL", -old_pid)
              true
            rescue SystemCallError
              false
            end

          # A worker that had to be KILLed has not been waited on yet; collect
          # its status now so it does not linger as a zombie under the master.
          Process.waitpid(old_pid) if killed && !reaped
        rescue Errno::ESRCH, Errno::ECHILD
          # already gone — fine
        end

        Clacky::Logger.info("[Master] Hot restart complete. New worker PID=#{new_pid}")
      end

      # Stop the current worker (if any), close the listen socket and terminate
      # the master with exit status 0. This method does not return.
      def shutdown
        Clacky::Logger.info("[Master] Shutting down (worker PID=#{@worker_pid})...")

        if @worker_pid
          begin
            # Signal the worker's whole process group so grandchildren
            # (node MCP, etc.) get the TERM as well.
            Process.kill("TERM", -@worker_pid)

            # Allow 3 seconds for a graceful exit before escalating to KILL.
            give_up_at = Time.now + 3
            loop do
              reaped, _status = Process.waitpid2(@worker_pid, Process::WNOHANG)
              break if reaped

              unless Time.now > give_up_at
                sleep 0.1
                next
              end

              Clacky::Logger.warn("[Master] Worker did not exit in time, sending KILL...")
              begin
                Process.kill("KILL", -@worker_pid)
              rescue StandardError
                nil
              end
              break
            end
          rescue Errno::ESRCH, Errno::ECHILD
            # already gone
          end
        end

        # Closing is best effort; the process is about to exit anyway.
        begin
          @socket.close
        rescue StandardError
          nil
        end

        Clacky::Logger.info("[Master] Exited.")
        exit(0)
      end

      # Path of the PID file for the current port:
      # <system tmpdir>/clacky-master-<port>.pid
      def pid_file_path
        file_name = "clacky-master-" + @port.to_s + ".pid"
        File.join(Dir.tmpdir, file_name)
      end

      # Write this process's PID to the PID file, replacing any previous content.
      def write_pid_file
        own_pid = Process.pid.to_s
        File.open(pid_file_path, "w") { |file| file.write(own_pid) }
      end

      # Delete the PID file for the current port. A file that is already gone
      # is not an error: another process may remove it between a check and the
      # delete, so the delete is attempted directly and ENOENT is ignored
      # instead of testing for existence first.
      def remove_pid_file
        File.delete(pid_file_path)
      rescue Errno::ENOENT
        # already removed — nothing to do
        nil
      end

      # Report whether @host:@port can be bound, retrying every 0.1s while the
      # address is in use.
      #
      # seconds - how long to keep retrying before giving up
      #
      # Returns true as soon as a probe bind succeeds (the probe socket is
      # closed right away), false once the deadline has passed.
      def port_free_within?(seconds)
        give_up_at = Time.now + seconds
        begin
          probe = TCPServer.new(@host, @port)
          probe.close
          true
        rescue Errno::EADDRINUSE
          return false if Time.now > give_up_at

          sleep 0.1
          retry
        end
      end

      # Try to bind to preferred_port, fall back to next ports if occupied.
      # Returns the bound TCPServer, or nil if all ports in range are occupied.
      def bind_with_fallback(host, preferred_port, max_port:)
        (preferred_port..max_port).each do |candidate|
          # An address-in-use error just means "try the next port".
          listener =
            begin
              TCPServer.new(host, candidate)
            rescue Errno::EADDRINUSE
              nil
            end
          next if listener.nil?

          # Only mention the port when we had to fall back from the preferred one.
          Clacky::Logger.info("[Master] Bound to port #{candidate}") unless candidate == preferred_port
          return listener
        end

        # Every port in the range was occupied.
        nil
      end

      # Print the startup banner to stdout: logo, tagline, an optional notice
      # that a fallback port is in use, then the Web UI URL, the version and
      # the stop hint.
      #
      # port_changed  - true when the bound port differs from the requested one
      # original_port - the port originally requested (shown in the notice)
      def print_banner(port_changed: false, original_port: nil)
        art = Clacky::Banner.new
        url = "http://#{@host}:#{@port}"

        header = ["", art.colored_cli_logo, art.colored_tagline, ""]
        notice = port_changed ? ["   [!] Port #{original_port} is in use, using #{@port} instead", ""] : []
        footer = [
          "   Web UI: #{art.highlight(url)}",
          "   Version: #{Clacky::VERSION}",
          "   Press Ctrl-C to stop.",
          ""
        ]

        [header, notice, footer].each do |section|
          section.each { |line| puts line }
        end
        nil
      end

      # Scan all fallback port PID files to prevent duplicate masters
      # when a previous instance bound to a non-default fallback port.
      def kill_existing_master
        # The default port may have fallen back to one of the next five ports,
        # so every candidate is checked; a user-specified port is checked alone.
        last_port = @port
        last_port += 5 if @port == 7070

        (@port..last_port).each do |candidate|
          kill_master_on_port(candidate)
        end
      end

      # Stop the master recorded in the PID file for +port+, if there is one.
      #
      # Sends TERM, waits up to 5 seconds for the process to disappear, then
      # escalates to KILL. The PID file is removed afterwards in every case.
      #
      # port - the port whose PID file (<tmpdir>/clacky-master-<port>.pid)
      #        should be inspected
      private def kill_master_on_port(port)
        path = File.join(Dir.tmpdir, "clacky-master-#{port}.pid")
        return unless File.exist?(path)

        # Best-effort removal; the file may vanish at any moment because the
        # other master deletes its own PID file while exiting.
        discard = lambda do
          begin
            File.delete(path)
          rescue SystemCallError
            nil
          end
        end

        pid =
          begin
            File.read(path).strip.to_i
          rescue Errno::ENOENT
            return # removed between the existence check and the read
          end

        # Unparseable content, or a stale file that happens to hold our own PID
        # (PID reuse): never signal ourselves — no traps are installed yet at
        # this point, so a TERM would terminate this very process.
        if pid <= 0 || pid == Process.pid
          discard.call
          return
        end

        begin
          Process.kill("TERM", pid)
          Clacky::Logger.info("[Master] Sent TERM to existing master (PID=#{pid}, port=#{port}), waiting...")

          deadline = Time.now + 5
          until process_dead?(pid) || Time.now > deadline
            sleep 0.1
          end

          unless process_dead?(pid)
            Clacky::Logger.warn("[Master] PID=#{pid} still alive after 5s, sending KILL...")
            begin
              Process.kill("KILL", pid)
            rescue Errno::ESRCH
              # exited between the liveness check and the KILL — fine
            end
          end

          Clacky::Logger.info("[Master] Existing master PID=#{pid} (port=#{port}) stopped.")
        rescue Errno::ESRCH
          Clacky::Logger.info("[Master] Existing master PID=#{pid} already gone.")
        rescue Errno::EPERM
          Clacky::Logger.warn("[Master] Could not stop existing master (PID=#{pid}) — permission denied.")
        ensure
          discard.call
        end
      end

      # True when no process with +pid+ exists. Signal 0 only probes for
      # existence: ESRCH means the process is gone, while EPERM means it exists
      # but we are not allowed to signal it, so it counts as alive.
      private def process_dead?(pid)
        Process.kill(0, pid)
        false
      rescue Errno::ESRCH, Errno::EPERM => probe_error
        probe_error.is_a?(Errno::ESRCH)
      end
    end
  end
end