module Clacky::Utils::FileProcessor
def self.binary_file_path?(path)
# Decide whether +path+ should be treated as a binary file.
#
# The path counts as binary when its lowercased extension is listed in
# BINARY_EXTENSIONS, or when the first 512 bytes of the file contain a NUL
# byte. Any StandardError raised while checking (for example, the file
# cannot be read) results in +false+.
#
# @param path [String] path of the file to inspect
# @return [Boolean]
def self.binary_file_path?(path)
  suffix = File.extname(path).downcase
  if BINARY_EXTENSIONS.include?(suffix)
    true
  else
    # Sniff only the head of the file; a NUL byte marks it as binary.
    head = File.binread(path, 512)
    head.to_s.include?("\x00")
  end
rescue StandardError
  false
end
def self.detect_image_mime_type(data, fallback_mime = "image/png")
Returns:
- (String) -- detected MIME type (e.g. "image/png", "image/jpeg")
Parameters:
- fallback_mime (String) -- MIME type from extension, used as fallback
- data (String) -- raw binary data (the first 12 bytes are sufficient)
# Identify an image's actual MIME type from its leading magic bytes.
#
# Only the first 12 bytes are inspected. They are taken with a byte-based
# slice so that (a) no byte array is built for the whole image and (b) the
# WEBP marker at byte offset 8 is found even when +data+ is not tagged as a
# binary string (character indexing would shift the offset whenever the
# RIFF size field happens to form a multi-byte character).
#
# @param data [String, nil] raw image bytes (the first 12 bytes are sufficient)
# @param fallback_mime [String] MIME type returned when no signature matches
# @return [String] detected MIME type, or +fallback_mime+ when +data+ is nil,
#   shorter than 4 bytes, or unrecognised
def self.detect_image_mime_type(data, fallback_mime = "image/png")
  return fallback_mime if data.nil? || data.bytesize < 4

  head = data.byteslice(0, 12).bytes
  magic4 = head[0, 4]

  if magic4 == [0x89, 0x50, 0x4E, 0x47]
    # PNG: \x89 P N G
    "image/png"
  elsif head[0, 3] == [0xFF, 0xD8, 0xFF]
    # JPEG: \xFF \xD8 \xFF
    "image/jpeg"
  elsif magic4 == [0x47, 0x49, 0x46, 0x38]
    # GIF: "GIF8" (GIF87a or GIF89a)
    "image/gif"
  elsif magic4 == [0x52, 0x49, 0x46, 0x46] && head[8, 4] == [0x57, 0x45, 0x42, 0x50]
    # WEBP: "RIFF" <4 size bytes> "WEBP" (needs all 12 bytes)
    "image/webp"
  elsif head[0, 2] == [0x42, 0x4D]
    # BMP: "BM"
    "image/bmp"
  elsif magic4 == [0x49, 0x49, 0x2A, 0x00] || magic4 == [0x4D, 0x4D, 0x00, 0x2A]
    # TIFF: II*\x00 (little-endian) or MM\x00* (big-endian)
    "image/tiff"
  else
    fallback_mime
  end
end
def self.detect_mime_type(path, _data = nil)
# Look up a MIME type from the lowercased file extension of +path+.
#
# @param path [String] file path or name
# @param _data [Object] accepted but not used by this method
# @return [String] the MIME_TYPES entry for the extension, or
#   "application/octet-stream" when there is none
def self.detect_mime_type(path, _data = nil)
  extension = File.extname(path).downcase
  known = MIME_TYPES[extension]
  return known if known

  "application/octet-stream"
end
def self.downscale_image_base64(b64, mime_type, max_width: IMAGE_MAX_WIDTH)
Returns:
- (String) -- base64-encoded (possibly downscaled) image data
Parameters:
- max_width (Integer) -- maximum output width in pixels (default: IMAGE_MAX_WIDTH)
- mime_type (String) -- e.g. "image/png", "image/jpeg", "image/webp"
- b64 (String) -- base64-encoded image data
# Downscale a base64-encoded image so it is at most +max_width+ pixels wide.
#
# PNG input goes through downscale_png_chunky; every other MIME type goes
# through downscale_via_cli. When the chosen helper returns nil (no resize
# happened), the input is returned unchanged unless it exceeds
# IMAGE_MAX_BASE64_BYTES, in which case an error is raised.
#
# @param b64 [String] base64-encoded image data
# @param mime_type [String] e.g. "image/png", "image/jpeg", "image/webp"
# @param max_width [Integer] maximum output width in pixels
# @return [String] base64-encoded (possibly downscaled) image data
# @raise [ArgumentError] when no resize happened and the data is over the limit
def self.downscale_image_base64(b64, mime_type, max_width: IMAGE_MAX_WIDTH)
  require "base64"

  resized = mime_type == "image/png" ? downscale_png_chunky(b64, max_width) : downscale_via_cli(b64, mime_type, max_width)
  return resized if resized

  # Nothing resized the image: pass it through only if it fits the size limit.
  return b64 unless b64.bytesize > IMAGE_MAX_BASE64_BYTES

  actual_kb = b64.bytesize / 1024
  allowed_mb = IMAGE_MAX_BASE64_BYTES / 1_000_000
  raise ArgumentError,
        "Image too large to send (#{actual_kb}KB > #{allowed_mb}MB). " \
        "Install ImageMagick (`brew install imagemagick`) to enable automatic resizing."
end
def self.downscale_png_chunky(b64, max_width)
Downscale a PNG using chunky_png (pure Ruby — always available).
# Downscale a base64-encoded PNG with chunky_png.
#
# Returns the input untouched when the image is already no wider than
# +max_width+. Otherwise the image is resampled (nearest neighbour) to
# +max_width+ with a proportionally scaled height, re-encoded, and logged.
# Any StandardError is logged and turned into a nil result.
#
# @param b64 [String] base64-encoded PNG data
# @param max_width [Integer] maximum output width in pixels
# @return [String, nil] base64 PNG data, or nil when downscaling failed
def self.downscale_png_chunky(b64, max_width)
  require "chunky_png"
  require "base64"

  png = ChunkyPNG::Image.from_blob(Base64.strict_decode64(b64))
  orig_w = png.width
  return b64 if orig_w <= max_width

  orig_h = png.height
  # Scale the height by the same ratio as the width.
  new_h = (orig_h * max_width.to_f / orig_w).round
  png.resample_nearest_neighbor!(max_width, new_h)

  encoded = Base64.strict_encode64(png.to_blob)
  Clacky::Logger.debug(
    "image_downscaled",
    format: "png",
    from: "#{orig_w}x#{orig_h} (#{b64.bytesize / 1024}KB)",
    to: "#{max_width}x#{new_h} (#{encoded.bytesize / 1024}KB)"
  )
  encoded
rescue => e
  Clacky::Logger.debug("image_downscale_skipped", format: "png", reason: e.message)
  nil
end
def self.downscale_via_cli(b64, mime_type, max_width)
Linux → convert (ImageMagick, must be installed)
macOS → sips (built-in, no extra deps)
Downscale a non-PNG image using CLI tools:
# Downscale a non-PNG image by shelling out to a platform tool.
#
# On darwin platforms `sips` is used; elsewhere `convert` is used when
# `which convert` succeeds. The image is written to a temporary directory,
# resized there, and read back as base64.
#
# @param b64 [String] base64-encoded image data
# @param mime_type [String] MIME type; its subtype becomes the temp-file
#   extension ("jpeg" is mapped to "jpg")
# @param max_width [Integer] maximum width handed to the resize tool
# @return [String, nil] base64 data of the resized image, or nil when no tool
#   ran successfully, no non-empty output was produced, or an error occurred
def self.downscale_via_cli(b64, mime_type, max_width)
  require "base64"
  require "tmpdir"

  subtype = mime_type.split("/").last
  ext = subtype == "jpeg" ? "jpg" : subtype

  Dir.mktmpdir("clacky-img") do |workdir|
    src = File.join(workdir, "input.#{ext}")
    dst = File.join(workdir, "output.#{ext}")
    File.binwrite(src, Base64.strict_decode64(b64))
    before_kb = b64.bytesize / 1024

    resized =
      if RUBY_PLATFORM.include?("darwin")
        # macOS: sips
        system("sips", "-Z", max_width.to_s, src, "--out", dst, out: File::NULL, err: File::NULL)
      elsif system("which convert > /dev/null 2>&1")
        # Linux/other: ImageMagick convert, only shrinking ("x>" geometry)
        system("convert", src, "-resize", "#{max_width}x>", dst, out: File::NULL, err: File::NULL)
      else
        false
      end

    return nil unless resized && File.exist?(dst) && File.size(dst) > 0

    encoded = Base64.strict_encode64(File.binread(dst))
    after_kb = encoded.bytesize / 1024
    Clacky::Logger.debug(
      "image_downscaled",
      format: ext,
      from: "#{before_kb}KB",
      to: "#{after_kb}KB (max #{max_width}px wide)"
    )
    encoded
  end
rescue => e
  Clacky::Logger.debug("image_downscale_skipped", mime: mime_type, reason: e.message)
  nil
end
def self.file_to_base64(path)
# Read a file and return it base64-encoded together with basic metadata.
#
# For files whose extension maps to an image MIME type, the real format is
# detected from the file's magic bytes and the encoded data is passed through
# downscale_image_base64.
#
# @param path [String] path of the file to encode
# @return [Hash] { format:, mime_type:, size_bytes:, base64_data: } where
#   +format+ is the lowercased extension without its leading dot
# @raise [ArgumentError] when the file is larger than MAX_FILE_BYTES
def self.file_to_base64(path)
  require "base64"

  ext = File.extname(path).downcase
  size = File.size(path)
  raise ArgumentError, "File too large: #{path}" if size > MAX_FILE_BYTES

  ext_mime = MIME_TYPES[ext] || "application/octet-stream"
  raw = File.binread(path)

  # Trust magic bytes over a possibly misleading image extension.
  mime =
    if ext_mime.start_with?("image/")
      detect_image_mime_type(raw, ext_mime)
    else
      ext_mime
    end

  encoded = Base64.strict_encode64(raw)
  # Images are downscaled before being returned.
  encoded = downscale_image_base64(encoded, mime) if mime.start_with?("image/")

  {
    format: ext[1..],
    mime_type: mime,
    size_bytes: size,
    base64_data: encoded
  }
end
def self.glob_allowed_binary?(path)
# Report whether the lowercased extension of +path+ is listed in
# GLOB_ALLOWED_BINARY_EXTENSIONS.
#
# @param path [String] file path or name
# @return [Boolean]
def self.glob_allowed_binary?(path)
  suffix = File.extname(path).downcase
  GLOB_ALLOWED_BINARY_EXTENSIONS.include?(suffix)
end
def self.image_path_to_data_url(path)
# Turn an image file on disk into a `data:` URL.
#
# The MIME type is guessed from the extension, then corrected from the
# file's magic bytes; the encoded image is passed through
# downscale_image_base64 before the URL is assembled.
#
# @param path [String] path to the image file
# @return [String] "data:<mime>;base64,<data>"
# @raise [ArgumentError] when the file does not exist or is larger than
#   MAX_IMAGE_BYTES
def self.image_path_to_data_url(path)
  raise ArgumentError, "Image file not found: #{path}" unless File.exist?(path)

  size = File.size(path)
  if size > MAX_IMAGE_BYTES
    raise ArgumentError, "Image too large (#{size / 1024}KB > #{MAX_IMAGE_BYTES / 1024}KB): #{path}"
  end

  require "base64"

  # The extension only supplies the fallback MIME type.
  ext = File.extname(path).downcase.delete(".")
  by_extension = {
    "jpg" => "image/jpeg",
    "jpeg" => "image/jpeg",
    "png" => "image/png",
    "gif" => "image/gif",
    "webp" => "image/webp"
  }
  guessed_mime = by_extension.fetch(ext) { "image/#{ext}" }

  raw = File.binread(path)
  # Magic bytes win over a misleading extension.
  mime = detect_image_mime_type(raw, guessed_mime)
  payload = downscale_image_base64(Base64.strict_encode64(raw), mime)

  "data:#{mime};base64,#{payload}"
end
def self.inline_local_images(content)
Returns:
- (String) -- content with local images replaced by data URLs
Parameters:
- content (String) -- markdown text potentially containing local image references
# Replace markdown image references to local files with inline data URLs.
#
# Matches `![alt](/abs/path)` and `![alt](file:///abs/path)`. A reference is
# rewritten only when the URL-unescaped path has an extension listed in
# LOCAL_IMAGE_EXTENSIONS and the file exists; otherwise, or when conversion
# raises a StandardError, the original text is kept.
#
# @param content [String, nil] markdown text
# @return [String, nil] content with local images replaced by data URLs
def self.inline_local_images(content)
  return content if content.nil? || content.empty?

  pattern = %r{(!\[[^\]]*\])\((file://)?(/[^)]+)\)}
  content.gsub(pattern) do |whole|
    captures = Regexp.last_match
    label = captures[1]
    path = CGI.unescape(captures[3])

    inlinable = LOCAL_IMAGE_EXTENSIONS.include?(File.extname(path).downcase) && File.exist?(path)
    next whole unless inlinable

    begin
      data_url = image_path_to_data_url(path)
      Clacky::Logger.info("file_processor.inline_local_images", path: path, size: File.size(path))
      "#{label}(#{data_url})"
    rescue StandardError => e
      Clacky::Logger.warn("file_processor.inline_local_images.failed", path: path, error: e.message)
      whole
    end
  end
end
def self.parse_tar_listing(path, ext)
.tar.gz/.tgz → gunzip stream + tar reader
.tar → raw tar reader
Handles:
List entries in a tarball or gzip file.
def self.parse_tar_listing(path, ext) require "rubygems/package" require "zlib" case ext when ".tar" lines = ["# TAR Contents\n"] File.open(path, "rb") do |file| Gem::Package::TarReader.new(file) do |tar| tar.each do |entry| kind = entry.directory? ? "[dir] " : "" size = entry.header.size ? " (#{entry.header.size} bytes)" : "" lines << "- #{kind}#{entry.full_name}#{size}" end end end lines.join("\n") when ".tar.gz", ".tgz" lines = ["# TAR.GZ Contents\n"] File.open(path, "rb") do |file| Zlib::GzipReader.wrap(file) do |gz| Gem::Package::TarReader.new(gz) do |tar| tar.each do |entry| kind = entry.directory? ? "[dir] " : "" size = entry.header.size ? " (#{entry.header.size} bytes)" : "" lines << "- #{kind}#{entry.full_name}#{size}" end end end end lines.join("\n") when ".gz" # Could be gzipped-tar with a misleading extension, or a single-file gzip. # Try tar first; on failure, fall back to single-file metadata. begin lines = ["# TAR.GZ Contents\n"] found_tar = false File.open(path, "rb") do |file| Zlib::GzipReader.wrap(file) do |gz| Gem::Package::TarReader.new(gz) do |tar| tar.each do |entry| found_tar = true kind = entry.directory? ? "[dir] " : "" size = entry.header.size ? " (#{entry.header.size} bytes)" : "" lines << "- #{kind}#{entry.full_name}#{size}" end end end end return lines.join("\n") if found_tar rescue StandardError # fall through to single-file gzip handling end # Single-file gzip: report the original filename (if recorded) and compressed/uncompressed sizes. original_name = nil uncompressed = nil File.open(path, "rb") do |file| Zlib::GzipReader.wrap(file) do |gz| original_name = gz.orig_name # Read fully to get the uncompressed size. Guarded: stop after 64MB # to avoid blowing memory on pathological inputs — the preview only # needs a size estimate, not the content. 
limit = 64 * 1024 * 1024 total = 0 while (chunk = gz.read(1024 * 1024)) total += chunk.bytesize break if total > limit end uncompressed = total end end lines = ["# GZIP Contents\n"] lines << "- Original filename: #{original_name || "(not recorded)"}" lines << "- Compressed size: #{File.size(path)} bytes" lines << "- Uncompressed size: #{uncompressed} bytes#{uncompressed && uncompressed > 64 * 1024 * 1024 ? " (truncated)" : ""}" lines.join("\n") end rescue => e "# Archive Contents\n(could not list entries: #{e.message})" end
def self.parse_zip_listing(body)
# List the entries of a ZIP archive held in memory.
#
# @param body [String] raw bytes of the ZIP archive
# @return [String] a "# ZIP Contents" listing with one "- name (N bytes)"
#   line per entry, or an error note when a StandardError is raised
def self.parse_zip_listing(body)
  listing = ["# ZIP Contents\n"]
  Zip::InputStream.open(StringIO.new(body)) do |stream|
    entry = stream.get_next_entry
    while entry
      bytes = entry.size ? " (#{entry.size} bytes)" : ""
      listing << "- #{entry.name}#{bytes}"
      entry = stream.get_next_entry
    end
  end
  listing.join("\n")
rescue => e
  "# ZIP Contents\n(could not list entries: #{e.message})"
end
def self.process(body:, filename:)
Returns:
- (FileRef)
# Store uploaded bytes via +save+ and build a FileRef for the stored file.
#
# @param body [String] raw file contents
# @param filename [String] name supplied for the file
# @return [FileRef] result of process_path for the saved file
def self.process(body:, filename:)
  stored_path, stored_name = save(body: body, filename: filename).values_at(:path, :name)
  process_path(stored_path, name: stored_name)
end
def self.process_path(path, name: nil)
Returns:
- (FileRef)
Parameters:
- name (String) -- Display name (defaults to basename)
- path (String) -- Path to the file on disk
# Build a FileRef for a file that already exists on disk.
#
# Dispatches on the file extension:
# - ".zip": entry listing saved as a preview
# - ".tar", ".tar.gz", ".tgz", ".gz": archive listing saved as a preview
#   (a parse_error is recorded instead if that raises)
# - ".png", ".jpg", ".jpeg", ".gif", ".webp": image reference, no preview
# - ".csv" and TEXT_PREVIEW_EXTENSIONS: the file itself is the preview
# - anything else: handed to Utils::ParserManager.parse
#
# @param path [String] path to the file on disk
# @param name [String, nil] display name; defaults to the basename of +path+
# @return [FileRef]
def self.process_path(path, name: nil)
  name ||= File.basename(path.to_s)

  # A display name ending in ".tar.gz" selects the compound extension so the
  # file is treated as a tarball rather than a plain gzip.
  ext = name.to_s.downcase.end_with?(".tar.gz") ? ".tar.gz" : File.extname(path.to_s).downcase
  type = FILE_TYPES[ext] || :file

  case ext
  when ".zip"
    listing = parse_zip_listing(File.binread(path))
    listing_path = save_preview(listing, path)
    FileRef.new(name: name, type: :zip, original_path: path, preview_path: listing_path)
  when ".tar", ".tar.gz", ".tgz", ".gz"
    # Tarballs and gzip'd files get an entry listing as their preview.
    begin
      listing = parse_tar_listing(path, ext)
      listing_path = save_preview(listing, path)
      FileRef.new(name: name, type: :zip, original_path: path, preview_path: listing_path)
    rescue => e
      FileRef.new(name: name, type: :zip, original_path: path, parse_error: e.message)
    end
  when ".png", ".jpg", ".jpeg", ".gif", ".webp"
    FileRef.new(name: name, type: :image, original_path: path)
  when ".csv"
    # CSV is plain text: preview_path points at the original, nothing is copied.
    FileRef.new(name: name, type: :csv, original_path: path, preview_path: path)
  when *TEXT_PREVIEW_EXTENSIONS
    # Same for the text-like extensions: the original file is the preview.
    FileRef.new(name: name, type: :text, original_path: path, preview_path: path)
  else
    parsed = Utils::ParserManager.parse(path)
    unless parsed[:success]
      return FileRef.new(name: name, type: type, original_path: path,
                         parse_error: parsed[:error], parser_path: parsed[:parser_path])
    end

    text_path = save_preview(parsed[:text], path)
    FileRef.new(name: name, type: type, original_path: path, preview_path: text_path)
  end
end
def self.rewrite_local_image_urls(content)
Returns:
- (String, nil) -- rewritten content (or original if nothing matched)
Parameters:
- content (String, nil) -- markdown text
# Rewrite markdown image references that point at existing local image files.
#
# Matches `![alt](/abs/path)` and `![alt](file:///abs/path)`. References
# whose URL-unescaped path does not exist, or whose extension is not in
# LOCAL_IMAGE_EXTENSIONS, are left untouched.
#
# NOTE(review): for a matching local image the replacement text is currently
# the EMPTY STRING, and the alt text and escaped href are computed but never
# used. This looks like the rewritten-URL literal was lost; confirm the
# intended replacement before relying on this method.
#
# @param content [String, nil] markdown text
# @return [String, nil] rewritten content (or the original if nothing matched)
def self.rewrite_local_image_urls(content)
  return content if content.nil? || content.empty?

  content.gsub(/!\[([^\]]*)\]\(((?:file:\/\/)?\/[^)]+)\)/) do |original|
    captures = Regexp.last_match
    _alt = captures[1]
    href = captures[2]

    # Derive the filesystem path from the href.
    fs_path = CGI.unescape(href.sub(%r{\Afile://}, ""))
    local_image = LOCAL_IMAGE_EXTENSIONS.include?(File.extname(fs_path).downcase) && File.exist?(fs_path)
    next original unless local_image

    _encoded = CGI.escape(href)
    ""
  end
end
def self.sanitize_filename(name)
# Produce a filesystem-safe file name from an arbitrary input name.
#
# The name is converted with Clacky::Utils::Encoding.to_utf8 before any
# regex runs, reduced to its basename, and each of / \ : * ? " < > | and NUL
# is replaced by "_". Surrounding whitespace is stripped. All other
# characters are kept.
#
# @param name [#to_s] proposed file name
# @return [String] sanitized name, or 'upload' when nothing is left
def self.sanitize_filename(name)
  utf8_name = Clacky::Utils::Encoding.to_utf8(name.to_s)
  cleaned = File.basename(utf8_name).gsub(/[\/\\:\*?"<>|\x00]/, '_').strip
  return 'upload' if cleaned.empty?

  cleaned
end
def self.save(body:, filename:)
Returns:
- (Hash) -- { name: String, path: String }
# Write uploaded bytes into UPLOAD_DIR under a randomized, sanitized name.
#
# @param body [String] raw file contents
# @param filename [String] name supplied for the file; passed through
#   sanitize_filename
# @return [Hash] { name: String, path: String }: the sanitized name and the
#   path the bytes were written to
def self.save(body:, filename:)
  FileUtils.mkdir_p(UPLOAD_DIR)

  clean_name = sanitize_filename(filename)
  # A random hex prefix keeps stored names distinct.
  stored_as = [SecureRandom.hex(8), clean_name].join("_")
  target = File.join(UPLOAD_DIR, stored_as)
  File.binwrite(target, body)

  { name: clean_name, path: target }
end
def self.save_image_to_disk(body:, mime_type:, filename: "image.jpg")
Save raw image bytes to disk and return a FileRef.
# Save raw image bytes to disk and return a FileRef of type :image.
#
# Storage goes through +save+, which performs the same steps this method
# previously spelled out inline (create UPLOAD_DIR, sanitize the name, write
# under a random prefix).
#
# @param body [String] raw image bytes
# @param mime_type [String] accepted but not used by this method
# @param filename [String] name supplied for the image
# @return [FileRef] reference named after the sanitized filename, pointing at
#   the stored file
def self.save_image_to_disk(body:, mime_type:, filename: "image.jpg")
  stored = save(body: body, filename: filename)
  FileRef.new(name: stored[:name], type: :image, original_path: stored[:path])
end
def self.save_preview(content, original_path)
# Write preview text for a file into UPLOAD_DIR and return the preview path.
#
# Previews always land in UPLOAD_DIR, even for files that live elsewhere, so
# no ".preview.md" sidecar files are created next to the original.
#
# The preview name is built from the basename of +original_path+ with each
# of / \ : * ? " < > | and NUL replaced by "_". This is the same character
# set sanitize_filename replaces; the backslash was previously missing here.
#
# @param content [String] preview text to write
# @param original_path [#to_s] path of the file the preview describes
# @return [String] path of the written "<hex>_<name>.preview.md" file
def self.save_preview(content, original_path)
  FileUtils.mkdir_p(UPLOAD_DIR)

  safe_name = File.basename(original_path.to_s).gsub(/[\/\\:\*?"<>|\x00]/, "_")
  dest = File.join(UPLOAD_DIR, "#{SecureRandom.hex(8)}_#{safe_name}.preview.md")
  File.write(dest, content)
  dest
end
def self.supported_binary_file?(path)
# Report whether the lowercased extension of +path+ is listed in
# LLM_BINARY_EXTENSIONS.
#
# @param path [String] file path or name
# @return [Boolean]
def self.supported_binary_file?(path)
  suffix = File.extname(path).downcase
  LLM_BINARY_EXTENSIONS.include?(suffix)
end