lib/mindee/input/sources/local_input_source.rb



# frozen_string_literal: true

require 'stringio'
require 'marcel'

require_relative '../../pdf'
require_relative '../../image'

module Mindee
  module Input
    # Document source handling.
    module Source
      # MIME types accepted by the server; any other detected type is rejected.
      ALLOWED_MIME_TYPES = %w[
        application/pdf
        image/heic
        image/png
        image/jpeg
        image/tiff
        image/webp
      ].freeze

      # Common ancestor of the mime-type related errors raised while loading a document.
      class MimeTypeError < StandardError; end

      # Raised when the detected mime type of a file is not part of ALLOWED_MIME_TYPES.
      class InvalidMimeTypeError < MimeTypeError
        # The rejected mime type, as given to the constructor.
        # @return [String]
        attr_reader :invalid_mimetype

        # Stores the rejected type and builds a message listing every allowed type.
        # @param mime_type [String] The mime type that was rejected.
        def initialize(mime_type)
          @invalid_mimetype = mime_type
          allowed_types = ALLOWED_MIME_TYPES.join(', ')
          super("'#{mime_type}' mime type not allowed, must be one of #{allowed_types}")
        end
      end

      # Raised when a corrupted PDF file could not be repaired.
      class UnfixablePDFError < MimeTypeError
        # Takes no argument: the message is always the same.
        def initialize
          message = "Corrupted PDF couldn't be repaired."
          super(message)
        end
      end

      # Base class for loading documents.
      #
      # Wraps an IO stream together with its filename, detects the mime type of the content on
      # construction and rejects anything that is not listed in ALLOWED_MIME_TYPES.
      class LocalInputSource
        # @return [String]
        attr_reader :filename
        # @return [String]
        attr_reader :file_mimetype
        # @return [StringIO]
        attr_reader :io_stream

        # @param io_stream [StringIO] Stream holding the content of the document.
        # @param filename [String] Name of the file.
        # @param fix_pdf [Boolean] Whether to attempt a repair when the file is named like a PDF
        #   but its content is not detected as an allowed type.
        # @raise [InvalidMimeTypeError] If the mime type is not allowed (after the repair attempt, if any).
        # @raise [UnfixablePDFError] If a repair was attempted and no usable PDF header was found.
        def initialize(io_stream, filename, fix_pdf: false)
          @io_stream = io_stream
          @filename = filename
          # With fix_pdf the type is detected from the content alone; otherwise the filename is
          # also handed to Marcel as a hint.
          @file_mimetype = if fix_pdf
                             Marcel::MimeType.for @io_stream
                           else
                             Marcel::MimeType.for @io_stream, name: @filename
                           end
          return if ALLOWED_MIME_TYPES.include? @file_mimetype

          # The extension check is case-insensitive so that files such as 'SCAN.PDF' are repaired too.
          if fix_pdf && @filename.downcase.end_with?('.pdf')
            rescue_broken_pdf(@io_stream)
            @file_mimetype = Marcel::MimeType.for @io_stream

            return if ALLOWED_MIME_TYPES.include? @file_mimetype
          end

          raise InvalidMimeTypeError, @file_mimetype.to_s
        end

        # Attempts to fix pdf files if mimetype is rejected.
        # "Broken PDFs" are often a result of third-party injecting invalid headers.
        # This attempts to remove them: everything located before the first '%PDF-' marker is dropped
        # and the internal stream is replaced by a new, rewound StringIO holding the remaining bytes.
        # @param stream [StringIO] Stream to scan for the '%PDF-' marker.
        # @raise [UnfixablePDFError] If the marker is missing, ends the stream, or ends past byte 500.
        def rescue_broken_pdf(stream)
          # The 500 bytes limit below is an absolute offset, so the scan has to start at the beginning.
          stream.rewind
          stream.gets('%PDF-')
          raise UnfixablePDFError if stream.eof? || stream.pos > 500

          # Step back over the 5 bytes of the marker so that it is part of the kept data.
          stream.pos = stream.pos - 5
          data = stream.read
          @io_stream.close

          @io_stream = StringIO.new
          @io_stream << data
          # Writing leaves the cursor at the end of the data: rewind so the next reader starts at the header.
          @io_stream.rewind
        end

        # Shorthand for pdf mimetype validation.
        # @return [Boolean] True if the detected mime type is 'application/pdf'.
        def pdf?
          @file_mimetype.to_s == 'application/pdf'
        end

        # Parses a PDF file according to provided options, and replaces the internal stream with the result.
        # @param options [Hash, nil] Page cutting/merge options:
        #
        #  * `:page_indexes` Zero-based list of page indexes.
        #  * `:operation` Operation to apply on the document, given the `page_indexes` specified:
        #      * `:KEEP_ONLY` - keep only the specified pages, and remove all others.
        #      * `:REMOVE` - remove the specified pages, and keep all others.
        #  * `:on_min_pages` Apply the operation only if document has at least this many pages.
        def process_pdf(options)
          @io_stream.seek(0)
          # Fully-qualified on purpose, consistent with #count_pdf_pages.
          @io_stream = Mindee::PDF::PdfProcessor.parse(@io_stream, options)
        end

        # Reads a document.
        # The stream is read from its start, whatever its current position.
        # @param close [Boolean] Whether to close the stream once its content has been read.
        # @return [Array(String, String, Hash)] The 'document' field name, the raw content of the file,
        #   and a hash holding the unicode-escaped filename under the `:filename` key.
        def read_document(close: true)
          @io_stream.seek(0)
          data = @io_stream.read
          @io_stream.close if close
          ['document', data, { filename: Mindee::Input::Source.convert_to_unicode_escape(@filename) }]
        end

        # Counts the pages of the document.
        # @return [Integer] Number of pages for a PDF; always 1 for any other type of file.
        def count_pdf_pages
          return 1 unless pdf?

          @io_stream.seek(0)
          pdf_processor = Mindee::PDF::PdfProcessor.open_pdf(@io_stream)
          pdf_processor.pages.size
        end

        # Compresses the file, according to the provided info.
        # The internal stream is replaced by the compressed output, and rewound.
        # @param [Integer] quality Quality of the output file.
        # @param [Integer, nil] max_width Maximum width (Ignored for PDFs).
        # @param [Integer, nil] max_height Maximum height (Ignored for PDFs).
        # @param [Boolean] force_source_text Whether to force the operation on PDFs with source text.
        #   This will attempt to re-render PDF text over the rasterized original. If disabled, the operation
        #   is ignored.
        #   WARNING: this operation is strongly discouraged.
        # @param [Boolean] disable_source_text If the PDF has source text, whether to re-apply it to the original or
        #   not. Needs force_source_text to work.
        def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true)
          buffer = if pdf?
                     Mindee::PDF::PDFCompressor.compress_pdf(
                       @io_stream,
                       quality: quality,
                       force_source_text_compression: force_source_text,
                       disable_source_text: disable_source_text
                     )
                   else
                     Mindee::Image::ImageCompressor.compress_image(
                       @io_stream,
                       quality: quality,
                       max_width: max_width,
                       max_height: max_height
                     )
                   end
          @io_stream = buffer
          @io_stream.rewind
        end

        # Checks whether the file has source text if it is a pdf. False otherwise.
        # @return [Boolean] True if the file is a PDF and has source text.
        def source_text?
          # Non-PDF files are never handed to the PDF tooling.
          return false unless pdf?

          Mindee::PDF::PDFTools.source_text?(@io_stream)
        end
      end

      # Replaces every multi-byte character by a `\u` escape sequence built from its code point,
      # written in lowercase hexadecimal and zero-padded to at least four digits.
      # Single-byte characters are kept as is.
      # NOTE: code points above U+FFFF produce more than four hexadecimal digits.
      # @param string [String] Text to escape.
      # @return [String] A new String holding the escaped text.
      def self.convert_to_unicode_escape(string)
        string.each_char.each_with_object(+'') do |character, escaped|
          if character.bytesize > 1
            escaped << format('\\u%04x', character.unpack1('U'))
          else
            escaped << character
          end
        end
      end
    end
  end
end