lib/clacky/utils/file_processor.rb



# frozen_string_literal: true

require "tmpdir"
require "fileutils"
require "securerandom"
require "stringio"

require_relative "parser_manager"
require "zip"

module Clacky
  module Utils
  # File processing pipeline.
  #
  # Two entry points:
  #   FileProcessor.save(body:, filename:)
  #     → Store raw bytes to disk only. Returns { name:, path: }.
  #       Used by http_server and channel adapters — no parsing here.
  #
  #   FileProcessor.process_path(path, name: nil)
  #     → Parse an already-saved file. Returns FileRef (with preview_path or parse_error).
  #       Used by agent.run when building the file prompt.
  #
  # (FileProcessor.process = save + process_path in one call, for convenience.)
  module FileProcessor
    UPLOAD_DIR      = File.join(Dir.tmpdir, "clacky-uploads").freeze
    MAX_FILE_BYTES  = 32 * 1024 * 1024  # 32 MB
    MAX_IMAGE_BYTES = 5 * 1024 * 1024    # 5 MB

    # Alias used by FileReader tool
    MAX_FILE_SIZE = MAX_FILE_BYTES

    # Images wider than this will be downscaled before sending to LLM (pixels)
    IMAGE_MAX_WIDTH = 800
    # Hard limit for images that can't be resized: Anthropic/Bedrock vision API supports up to 5MB
    IMAGE_MAX_BASE64_BYTES = 5_000_000

    BINARY_EXTENSIONS = %w[
      .png .jpg .jpeg .gif .webp .bmp .tiff .ico .svg
      .pdf
      .zip .gz .tgz .tar .rar .7z
      .exe .dll .so .dylib
      .mp3 .mp4 .avi .mov .mkv .wav .flac
      .ttf .otf .woff .woff2
      .db .sqlite .bin .dat
    ].freeze

    GLOB_ALLOWED_BINARY_EXTENSIONS = %w[
      .pdf .doc .docx .ppt .pptx .xls .xlsx .odt .odp .ods
    ].freeze

    LLM_BINARY_EXTENSIONS = %w[.png .jpg .jpeg .gif .webp .pdf].freeze

    MIME_TYPES = {
      ".png"  => "image/png",
      ".jpg"  => "image/jpeg",
      ".jpeg" => "image/jpeg",
      ".gif"  => "image/gif",
      ".webp" => "image/webp",
      ".pdf"  => "application/pdf"
    }.freeze

    FILE_TYPES = {
      ".docx" => :document,  ".doc"  => :document,
      ".xlsx" => :spreadsheet, ".xls" => :spreadsheet,
      ".pptx" => :presentation, ".ppt" => :presentation,
      ".pdf"  => :pdf,
      ".zip"  => :zip, ".gz" => :zip, ".tgz" => :zip, ".tar" => :zip, ".rar" => :zip, ".7z" => :zip,
      ".png"  => :image, ".jpg" => :image, ".jpeg" => :image,
      ".gif"  => :image, ".webp" => :image,
      ".csv"  => :csv,
      ".md"   => :text, ".markdown" => :text, ".txt" => :text, ".log" => :text
    }.freeze

    # Plain-text extensions whose raw content can be embedded directly as the
    # preview (no external parser needed). Kept conservative to avoid pulling
    # in huge source files by mistake.
    TEXT_PREVIEW_EXTENSIONS = %w[.md .markdown .txt .log].freeze

    # FileRef: result of process / process_path.
    FileRef = Struct.new(:name, :type, :original_path, :preview_path, :parse_error, :parser_path, keyword_init: true) do
      def parse_failed?
        preview_path.nil? && !parse_error.nil?
      end
    end

    # ---------------------------------------------------------------------------
    # Public API
    # ---------------------------------------------------------------------------

    # Store raw bytes to disk — no parsing.
    # Used by http_server upload endpoint and channel adapters.
    #
    # @return [Hash] { name: String, path: String }
    def self.save(body:, filename:)
      FileUtils.mkdir_p(UPLOAD_DIR)
      safe_name = sanitize_filename(filename)
      dest      = File.join(UPLOAD_DIR, "#{SecureRandom.hex(8)}_#{safe_name}")
      File.binwrite(dest, body)
      { name: safe_name, path: dest }
    end

    # Parse an already-saved file and return a FileRef.
    # Called by agent.run for each disk file before building the prompt.
    #
    # @param path [String] Path to the file on disk
    # @param name [String] Display name (defaults to basename)
    # @return [FileRef]
    def self.process_path(path, name: nil)
      name ||= File.basename(path.to_s)
      # Use compound extension for .tar.gz so it's treated as a tarball, not gzip.
      basename_lower = name.to_s.downcase
      ext =
        if basename_lower.end_with?(".tar.gz")
          ".tar.gz"
        else
          File.extname(path.to_s).downcase
        end
      type  = FILE_TYPES[ext] || :file

      case ext
      when ".zip"
        body            = File.binread(path)
        preview_content = parse_zip_listing(body)
        preview_path    = save_preview(preview_content, path)
        FileRef.new(name: name, type: :zip, original_path: path, preview_path: preview_path)

      when ".tar", ".tar.gz", ".tgz", ".gz"
        # Archive listing for tarballs and gzip'd files. Provides the LLM a
        # file-tree preview so it can decide whether to ask the user to
        # extract them (via the shell tool).
        begin
          preview_content = parse_tar_listing(path, ext)
          preview_path    = save_preview(preview_content, path)
          FileRef.new(name: name, type: :zip, original_path: path, preview_path: preview_path)
        rescue => e
          FileRef.new(name: name, type: :zip, original_path: path, parse_error: e.message)
        end

      when ".png", ".jpg", ".jpeg", ".gif", ".webp"
        FileRef.new(name: name, type: :image, original_path: path)

      when ".csv"
        # CSV is plain text — the file itself IS the preview. No parser, no copy.
        # FileReader handles encoding fallback via safe_utf8 when it reads the file.
        FileRef.new(name: name, type: :csv, original_path: path, preview_path: path)

      when *TEXT_PREVIEW_EXTENSIONS
        # Markdown / plain text / log: the file itself IS the preview.
        # No parser needed, no tmpdir copy — just point preview_path at the original.
        FileRef.new(name: name, type: :text, original_path: path, preview_path: path)

      else
        result = Utils::ParserManager.parse(path)
        if result[:success]
          preview_path = save_preview(result[:text], path)
          FileRef.new(name: name, type: type, original_path: path, preview_path: preview_path)
        else
          FileRef.new(name: name, type: type, original_path: path,
                      parse_error: result[:error], parser_path: result[:parser_path])
        end
      end
    end

    # Save + parse in one call (convenience method).
    #
    # @return [FileRef]
    def self.process(body:, filename:)
      saved = save(body: body, filename: filename)
      process_path(saved[:path], name: saved[:name])
    end

    # Save raw image bytes to disk and return a FileRef.
    # Used by agent when an image exceeds MAX_IMAGE_BYTES and must be downgraded to disk.
    def self.save_image_to_disk(body:, mime_type:, filename: "image.jpg")
      FileUtils.mkdir_p(UPLOAD_DIR)
      safe_name = sanitize_filename(filename)
      dest      = File.join(UPLOAD_DIR, "#{SecureRandom.hex(8)}_#{safe_name}")
      File.binwrite(dest, body)
      FileRef.new(name: safe_name, type: :image, original_path: dest)
    end

    # ---------------------------------------------------------------------------
    # File type helpers (used by tools and agent)
    # ---------------------------------------------------------------------------

    def self.binary_file_path?(path)
      ext = File.extname(path).downcase
      return true if BINARY_EXTENSIONS.include?(ext)
      File.binread(path, 512).to_s.include?("\x00")
    rescue
      false
    end

    def self.glob_allowed_binary?(path)
      GLOB_ALLOWED_BINARY_EXTENSIONS.include?(File.extname(path).downcase)
    end

    def self.supported_binary_file?(path)
      LLM_BINARY_EXTENSIONS.include?(File.extname(path).downcase)
    end

    def self.detect_mime_type(path, _data = nil)
      MIME_TYPES[File.extname(path).downcase] || "application/octet-stream"
    end

    # Downscale a base64-encoded image so its width is at most max_width pixels.
    #
    # Strategy:
    #   PNG  → chunky_png (pure Ruby, always available as gem dependency)
    #   other formats (JPG/WEBP/GIF) → sips on macOS, `convert` (ImageMagick) on Linux
    #   fallback (no CLI tool) → return as-is, but raise if larger than IMAGE_MAX_BASE64_BYTES
    #
    # @param b64       [String]  base64-encoded image data
    # @param mime_type [String]  e.g. "image/png", "image/jpeg", "image/webp"
    # @param max_width [Integer] maximum output width in pixels (default: IMAGE_MAX_WIDTH)
    # @return [String] base64-encoded (possibly downscaled) image data
    def self.downscale_image_base64(b64, mime_type, max_width: IMAGE_MAX_WIDTH)
      require "base64"

      result = if mime_type == "image/png"
                 downscale_png_chunky(b64, max_width)
               else
                 downscale_via_cli(b64, mime_type, max_width)
               end

      return result if result

      # No resize tool available — enforce API hard size limit (5MB)
      if b64.bytesize > IMAGE_MAX_BASE64_BYTES
        size_kb = b64.bytesize / 1024
        limit_mb = IMAGE_MAX_BASE64_BYTES / 1_000_000
        raise ArgumentError,
          "Image too large to send (#{size_kb}KB > #{limit_mb}MB). " \
          "Install ImageMagick (`brew install imagemagick`) to enable automatic resizing."
      end
      b64
    end

    def self.file_to_base64(path)
      require "base64"
      ext  = File.extname(path).downcase
      size = File.size(path)
      raise ArgumentError, "File too large: #{path}" if size > MAX_FILE_BYTES
      ext_mime = MIME_TYPES[ext] || "application/octet-stream"
      raw_data = File.binread(path)
      # Detect actual image format from magic bytes (ignore misleading extensions)
      mime = ext_mime.start_with?("image/") ? detect_image_mime_type(raw_data, ext_mime) : ext_mime
      data = Base64.strict_encode64(raw_data)
      # Downscale images before sending to LLM to reduce token cost
      data = downscale_image_base64(data, mime) if mime.start_with?("image/")
      { format: ext[1..], mime_type: mime, size_bytes: size, base64_data: data }
    end

    def self.image_path_to_data_url(path)
      raise ArgumentError, "Image file not found: #{path}" unless File.exist?(path)
      size = File.size(path)
      if size > MAX_IMAGE_BYTES
        raise ArgumentError, "Image too large (#{size / 1024}KB > #{MAX_IMAGE_BYTES / 1024}KB): #{path}"
      end
      require "base64"
      # Extension-based guess as fallback only
      ext  = File.extname(path).downcase.delete(".")
      ext_mime = case ext
                 when "jpg", "jpeg" then "image/jpeg"
                 when "png"         then "image/png"
                 when "gif"         then "image/gif"
                 when "webp"        then "image/webp"
                 else "image/#{ext}"
                 end
      raw_data = File.binread(path)
      # Detect actual image format from magic bytes (ignore misleading extensions)
      mime = detect_image_mime_type(raw_data, ext_mime)
      b64 = Base64.strict_encode64(raw_data)
      # Downscale images before sending to LLM to reduce token cost
      b64 = downscale_image_base64(b64, mime)
      "data:#{mime};base64,#{b64}"
    end

    # ---------------------------------------------------------------------------
    # Private helpers
    # ---------------------------------------------------------------------------

    def self.parse_zip_listing(body)
      lines = ["# ZIP Contents\n"]
      Zip::InputStream.open(StringIO.new(body)) do |zis|
        while (entry = zis.get_next_entry)
          size = entry.size ? " (#{entry.size} bytes)" : ""
          lines << "- #{entry.name}#{size}"
        end
      end
      lines.join("\n")
    rescue => e
      "# ZIP Contents\n(could not list entries: #{e.message})"
    end

    # List entries in a tarball or gzip file.
    #
    # Handles:
    #   .tar        → raw tar reader
    #   .tar.gz/.tgz → gunzip stream + tar reader
    #   .gz         → single gzipped file → show original filename + uncompressed size
    def self.parse_tar_listing(path, ext)
      require "rubygems/package"
      require "zlib"

      case ext
      when ".tar"
        lines = ["# TAR Contents\n"]
        File.open(path, "rb") do |file|
          Gem::Package::TarReader.new(file) do |tar|
            tar.each do |entry|
              kind = entry.directory? ? "[dir] " : ""
              size = entry.header.size ? " (#{entry.header.size} bytes)" : ""
              lines << "- #{kind}#{entry.full_name}#{size}"
            end
          end
        end
        lines.join("\n")

      when ".tar.gz", ".tgz"
        lines = ["# TAR.GZ Contents\n"]
        File.open(path, "rb") do |file|
          Zlib::GzipReader.wrap(file) do |gz|
            Gem::Package::TarReader.new(gz) do |tar|
              tar.each do |entry|
                kind = entry.directory? ? "[dir] " : ""
                size = entry.header.size ? " (#{entry.header.size} bytes)" : ""
                lines << "- #{kind}#{entry.full_name}#{size}"
              end
            end
          end
        end
        lines.join("\n")

      when ".gz"
        # Could be gzipped-tar with a misleading extension, or a single-file gzip.
        # Try tar first; on failure, fall back to single-file metadata.
        begin
          lines = ["# TAR.GZ Contents\n"]
          found_tar = false
          File.open(path, "rb") do |file|
            Zlib::GzipReader.wrap(file) do |gz|
              Gem::Package::TarReader.new(gz) do |tar|
                tar.each do |entry|
                  found_tar = true
                  kind = entry.directory? ? "[dir] " : ""
                  size = entry.header.size ? " (#{entry.header.size} bytes)" : ""
                  lines << "- #{kind}#{entry.full_name}#{size}"
                end
              end
            end
          end
          return lines.join("\n") if found_tar
        rescue StandardError
          # fall through to single-file gzip handling
        end

        # Single-file gzip: report the original filename (if recorded) and compressed/uncompressed sizes.
        original_name = nil
        uncompressed  = nil
        File.open(path, "rb") do |file|
          Zlib::GzipReader.wrap(file) do |gz|
            original_name = gz.orig_name
            # Read fully to get the uncompressed size. Guarded: stop after 64MB
            # to avoid blowing memory on pathological inputs — the preview only
            # needs a size estimate, not the content.
            limit   = 64 * 1024 * 1024
            total   = 0
            while (chunk = gz.read(1024 * 1024))
              total += chunk.bytesize
              break if total > limit
            end
            uncompressed = total
          end
        end
        lines = ["# GZIP Contents\n"]
        lines << "- Original filename: #{original_name || "(not recorded)"}"
        lines << "- Compressed size:   #{File.size(path)} bytes"
        lines << "- Uncompressed size: #{uncompressed} bytes#{uncompressed && uncompressed > 64 * 1024 * 1024 ? " (truncated)" : ""}"
        lines.join("\n")
      end
    rescue => e
      "# Archive Contents\n(could not list entries: #{e.message})"
    end

    def self.save_preview(content, original_path)
      # Always write previews to a tmpdir-based path to avoid polluting the
      # user's working directory with .preview.md sidecar files.
      # Use the same UPLOAD_DIR that uploaded files live in; for on-disk files
      # outside that dir (e.g. project files opened by file_reader), we still
      # land in UPLOAD_DIR so the user's tree stays clean.
      FileUtils.mkdir_p(UPLOAD_DIR)
      safe_name = File.basename(original_path.to_s).gsub(/[\/\:\*?"<>|\x00]/, "_")
      dest = File.join(UPLOAD_DIR, "#{SecureRandom.hex(8)}_#{safe_name}.preview.md")
      File.write(dest, content)
      dest
    end

    def self.sanitize_filename(name)
      # Keep Unicode letters/digits (including CJK), ASCII word chars, dots, hyphens, spaces.
      # Only strip characters that are unsafe on common filesystems: / \ : * ? " < > | \0
      # to_utf8 first: HTTP multipart headers arrive as ASCII-8BIT on Ruby 2.6,
      # and regex matching against ASCII-8BIT raises "invalid byte sequence in UTF-8".
      base = File.basename(Clacky::Utils::Encoding.to_utf8(name.to_s))
               .gsub(/[\/\\:\*?"<>|\x00]/, '_')
               .strip
      base.empty? ? 'upload' : base
    end

    # Detect the actual image MIME type from raw binary data by inspecting
    # magic bytes, ignoring the file extension. Falls back to extension-based
    # detection when magic bytes don't match any known format.
    #
    # Handles: PNG, JPEG, GIF, WEBP, BMP, TIFF
    #
    # @param data [String] raw binary data (first 12 bytes is sufficient)
    # @param fallback_mime [String] MIME type from extension, used as fallback
    # @return [String] detected MIME type (e.g. "image/png", "image/jpeg")
    def self.detect_image_mime_type(data, fallback_mime = "image/png")
      return fallback_mime if data.nil? || data.bytesize < 4

      bytes = data.bytes

      case
      # PNG: \x89 P N G \r \n \x1a \n
      when bytes[0] == 0x89 && bytes[1] == 0x50 && bytes[2] == 0x4E && bytes[3] == 0x47
        "image/png"
      # JPEG: \xFF \xD8 \xFF
      when bytes[0] == 0xFF && bytes[1] == 0xD8 && bytes[2] == 0xFF
        "image/jpeg"
      # GIF: GIF87a or GIF89a
      when bytes[0] == 0x47 && bytes[1] == 0x49 && bytes[2] == 0x46 && bytes[3] == 0x38
        "image/gif"
      # WEBP: RIFF .... WEBP
      when bytes[0] == 0x52 && bytes[1] == 0x49 && bytes[2] == 0x46 && bytes[3] == 0x46 &&
           data.bytesize >= 12 && data[8, 4] == "WEBP"
        "image/webp"
      # BMP: BM
      when bytes[0] == 0x42 && bytes[1] == 0x4D
        "image/bmp"
      # TIFF: II*\x00 (little-endian) or MM\x00* (big-endian)
      when (bytes[0] == 0x49 && bytes[1] == 0x49 && bytes[2] == 0x2A && bytes[3] == 0x00) ||
           (bytes[0] == 0x4D && bytes[1] == 0x4D && bytes[2] == 0x00 && bytes[3] == 0x2A)
        "image/tiff"
      else
        fallback_mime
      end
    end

    # ---------------------------------------------------------------------------
    # Image downscale helpers (private)
    # ---------------------------------------------------------------------------

    # Downscale a PNG using chunky_png (pure Ruby — always available).
    # Returns downscaled base64, or original base64 if already within max_width.
    def self.downscale_png_chunky(b64, max_width)
      require "chunky_png"
      require "base64"
      image = ChunkyPNG::Image.from_blob(Base64.strict_decode64(b64))
      return b64 if image.width <= max_width

      src_w, src_h = image.width, image.height
      dst_h = (src_h * max_width.to_f / src_w).round
      image.resample_nearest_neighbor!(max_width, dst_h)
      before_kb = b64.bytesize / 1024
      result    = Base64.strict_encode64(image.to_blob)
      after_kb  = result.bytesize / 1024
      Clacky::Logger.debug("image_downscaled",
        format: "png",
        from: "#{src_w}x#{src_h} (#{before_kb}KB)",
        to:   "#{max_width}x#{dst_h} (#{after_kb}KB)")
      result
    rescue => e
      Clacky::Logger.debug("image_downscale_skipped", format: "png", reason: e.message)
      nil
    end

    # Downscale a non-PNG image using CLI tools:
    #   macOS → sips (built-in, no extra deps)
    #   Linux → convert (ImageMagick, must be installed)
    # Returns downscaled base64, or nil if no tool is available.
    def self.downscale_via_cli(b64, mime_type, max_width)
      require "base64"
      require "tmpdir"

      ext = mime_type.split("/").last
      ext = "jpg" if ext == "jpeg"

      # Write input to a temp file
      Dir.mktmpdir("clacky-img") do |dir|
        input  = File.join(dir, "input.#{ext}")
        output = File.join(dir, "output.#{ext}")
        File.binwrite(input, Base64.strict_decode64(b64))

        before_kb = b64.bytesize / 1024
        success = false

        if RUBY_PLATFORM.include?("darwin")
          # macOS: sips is always available
          success = system("sips", "-Z", max_width.to_s, input, "--out", output,
                           out: File::NULL, err: File::NULL)
        else
          # Linux/other: try ImageMagick convert
          if system("which convert > /dev/null 2>&1")
            success = system("convert", input, "-resize", "#{max_width}x>",
                             output, out: File::NULL, err: File::NULL)
          end
        end

        return nil unless success && File.exist?(output) && File.size(output) > 0

        result    = Base64.strict_encode64(File.binread(output))
        after_kb  = result.bytesize / 1024
        Clacky::Logger.debug("image_downscaled",
          format: ext,
          from: "#{before_kb}KB",
          to:   "#{after_kb}KB (max #{max_width}px wide)")
        result
      end
    rescue => e
      Clacky::Logger.debug("image_downscale_skipped", mime: mime_type, reason: e.message)
      nil
    end

    # Image extensions that can be inlined as data URLs in markdown content.
    LOCAL_IMAGE_EXTENSIONS = %w[.png .jpg .jpeg .gif .webp].freeze

    # Replace local image paths in markdown content with base64 data URLs.
    #
    # Handles both `file:///path/to/img.png` and bare `/path/to/img.png` in
    # markdown image syntax `![alt](src)`.
    #
    # @param content [String] markdown text potentially containing local image references
    # @return [String] content with local images replaced by data URLs
    def self.inline_local_images(content)
      return content if content.nil? || content.empty?

      content.gsub(%r{(!\[[^\]]*\])\((file://)?(/[^)]+)\)}) do
        prefix     = $1
        _scheme    = $2
        raw_path   = $3
        path       = CGI.unescape(raw_path)
        ext        = File.extname(path).downcase
        full_match = $&

        unless LOCAL_IMAGE_EXTENSIONS.include?(ext) && File.exist?(path)
          next full_match
        end

        begin
          data_url = image_path_to_data_url(path)
          Clacky::Logger.info("file_processor.inline_local_images", path: path, size: File.size(path))
          "#{prefix}(#{data_url})"
        rescue StandardError => e
          Clacky::Logger.warn("file_processor.inline_local_images.failed", path: path, error: e.message)
          full_match
        end
      end
    end

    private_class_method :parse_zip_listing, :parse_tar_listing, :save_preview, :sanitize_filename,
                         :downscale_png_chunky, :downscale_via_cli

    # -------------------------------------------------------------------------
    # Local image URL rewriting
    # -------------------------------------------------------------------------

    # Rewrite local image paths in markdown content to use the /api/local-image proxy.
    #
    # Matches two patterns inside `![alt](url)`:
    #   1. file:// URLs  →  ![alt](/api/local-image?path=file:///abs/path.png)
    #   2. bare absolute paths  →  ![alt](/api/local-image?path=/abs/path.png)
    #
    # https:// URLs and non-image files are left untouched.
    #
    # @param content [String, nil] markdown text
    # @return [String, nil] rewritten content (or original if nothing matched)
    def self.rewrite_local_image_urls(content)
      return content if content.nil? || content.empty?

      content.gsub(/!\[([^\]]*)\]\(((?:file:\/\/)?\/[^)]+)\)/) do |match|
        alt  = Regexp.last_match(1)
        href = Regexp.last_match(2)

        # Extract the filesystem path from the href
        path = href.sub(%r{\Afile://}, "")
        path = CGI.unescape(path)

        ext = File.extname(path).downcase
        if LOCAL_IMAGE_EXTENSIONS.include?(ext) && File.exist?(path)
          encoded = CGI.escape(href)
          "![#{alt}](/api/local-image?path=#{encoded})"
        else
          match # return original match unchanged
        end
      end
    end
  end
  end
end