gem.sh

lib/mindee/pdf/pdf_tools.rb

# frozen_string_literal: true

module Mindee
  module PDF
    # Monkey-patching for Origami
    module PDFTools
      # Serializes the document into an in-memory binary stream.
      #
      # The defaults (+delinearize: true+, +recompile: true+, +decrypt: false+) are merged with
      # +params+; values supplied by the caller take precedence.
      # @param params [Hash] Output options overriding the defaults.
      # @return [StringIO] Binary-encoded stream wrapping the result of +output+.
      def to_io_stream(params = {})
        options = {
          delinearize: true,
          recompile: true,
          decrypt: false,
        }
        options.update(params)

        if frozen? # incompatible flags with frozen doc (signed)
          options[:recompile] = nil
          options[:rebuild_xrefs] = nil
          options[:noindent] = nil
          options[:obfuscate] = false
        end
        load_all_objects unless @loaded

        # The case-insensitive flag must sit outside the %r{} delimiters: written inside them,
        # "/i" is matched literally and intents such as "PDF/A1" are never recognised.
        intents_as_pdfa1 if options[:intent] =~ %r{pdf[/-]?A1?}i
        delinearize! if options[:delinearize] && linearized?
        compile(options) if options[:recompile]

        io_stream = StringIO.new(output(options))
        io_stream.set_encoding Encoding::BINARY
        io_stream
      end

      # Looks for PDF text operators in the content of a page stream.
      # See https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf page 243-251.
      #
      # The check is a plain substring search over the raw content, so it is a heuristic:
      # any occurrence of one of the listed tokens counts as text.
      # @param stream [#data] Stream object from a PDF page; its content is read through +data+.
      # @return [Boolean] True if one of the listed tokens occurs in the content, false when
      #   the content is nil, empty, or holds none of them.
      def self.stream_has_text?(stream)
        content = stream.data
        return false if content.nil?
        return false if content.empty?

        ['Tc', 'Tw', 'Th', 'TL', 'Tf', 'Tk', 'Tr', 'Tm', 'T*', 'Tj', 'TJ', "'", '"'].each do |operator|
          return true if content.include?(operator)
        end

        false
      end

      # Checks whether the file has source text. Returns false if Origami rejects the data
      # as an invalid PDF.
      #
      # The stream is rewound, parsed, and the content streams of each page are scanned
      # until one of them contains a text operator.
      # @param pdf_data [StringIO] Raw PDF data; it is rewound before being parsed.
      # @return [Boolean] True if the pdf has source text, false otherwise.
      def self.source_text?(pdf_data)
        pdf_data.rewind
        pdf = Origami::PDF.read(pdf_data)

        pdf.each_page do |page|
          next unless page[:Contents]

          # A page may reference a single content stream or an array of them; normalise to a list.
          contents = page[:Contents].solve
          contents = [contents] unless contents.is_a?(Origami::Array)

          return true if contents.any? { |stream_ref| stream_has_text?(stream_ref.solve) }
        end

        false
      rescue Origami::InvalidPDFError
        false
      end

      # Builds an Origami image XObject from an image.
      #
      # The image is first converted to an in-memory stream through
      # +ImageUtils.image_to_stringio+, then handed to Origami with the 'jpg' format.
      # @param image Image accepted by +ImageUtils.image_to_stringio+.
      # @return The object returned by +ImageXObject.from_image_file+.
      def self.create_xobject(image)
        Origami::Graphics::ImageXObject.from_image_file(
          Mindee::Image::ImageUtils.image_to_stringio(image),
          'jpg'
        )
      end

      # Fills in the dictionary entries of an image XObject from the image it represents.
      #
      # Sets the bit depth (always 8), the filter, the pixel dimensions and the colorspace.
      # @param xobject Object exposing a +dictionary+ that supports +[]=+.
      # @param image Image exposing +[:width]+, +[:height]+ and the metadata read by
      #   +determine_filter+ and +determine_colorspace+.
      # @return The colorspace value assigned last.
      def self.set_xobject_properties(xobject, image)
        attributes = xobject.dictionary
        attributes[:BitsPerComponent] = 8
        attributes[:Filter] = determine_filter(image)
        attributes[:Width] = image[:width]
        attributes[:Height] = image[:height]
        attributes[:ColorSpace] = determine_colorspace(image)
      end

      # Picks the PDF stream filter matching the compression reported in the image metadata.
      #
      # The filter name is read from +image.data['properties']['filter']+. Missing metadata
      # (no 'properties' entry, or no 'filter' inside it) falls back to the default instead
      # of raising.
      # @param image [#data] Image whose +data+ is a Hash of metadata.
      # @return [Symbol] +:FlateDecode+ for Zip, +:LZWDecode+ for LZW, +:DCTDecode+ otherwise.
      def self.determine_filter(image)
        # dig tolerates an absent 'properties' hash, yielding nil, which no pattern matches.
        filter = image.data.dig('properties', 'filter')
        case filter
        when %r{Zip}i then :FlateDecode
        when %r{LZW}i then :LZWDecode
        else :DCTDecode
        end
      end

      # Maps the colorspace reported in the image metadata to a PDF device colorspace.
      #
      # The name is read from +image.data['colorspace']+ and compared case-sensitively.
      # @param image [#data] Image whose +data+ supports +['colorspace']+.
      # @return [Symbol] +:DeviceCMYK+ for 'CMYK', +:DeviceGray+ for 'Gray' or
      #   'PseudoClass Gray', +:DeviceRGB+ for anything else.
      def self.determine_colorspace(image)
        name = image.data['colorspace']
        return :DeviceCMYK if name == 'CMYK'
        return :DeviceGray if ['Gray', 'PseudoClass Gray'].include?(name)

        :DeviceRGB
      end

      # Replaces the page's contents with a stream that draws the named XObject.
      #
      # The generated content saves the graphics state (q), applies a matrix scaling by
      # +width+ and +height+ (cm), paints the XObject (Do), then restores the state (Q).
      # @param page Page object exposing a +Contents=+ writer.
      # @param xobject_name Name under which the XObject is referenced in the content.
      # @param width Horizontal scale written into the matrix.
      # @param height Vertical scale written into the matrix.
      # @return The newly created content stream.
      def self.add_content_to_page(page, xobject_name, width, height)
        drawing_instructions = <<~CONTENT
          q
          #{width} 0 0 #{height} 0 0 cm
          /#{xobject_name} Do
          Q
        CONTENT
        page.Contents = Origami::Stream.new(drawing_instructions)
      end

      # Sets both the media box and the crop box of a page to the same rectangle,
      # anchored at the origin.
      # @param page Page object supporting +[]=+ with +:MediaBox+ and +:CropBox+ keys.
      # @param width Width of the rectangle.
      # @param height Height of the rectangle.
      # @return [Array] The +[0, 0, width, height]+ rectangle assigned to the crop box.
      def self.set_page_dimensions(page, width, height)
        # Each box gets its own array instance; the last one assigned is returned.
        %i[MediaBox CropBox].map { |box_name| page[box_name] = [0, 0, width, height] }.last
      end

      # Compresses image data and wraps the result in a new image XObject.
      #
      # The data is passed to +ImageCompressor.compress_image+ with the given quality and
      # with +width+/+height+ as maximum dimensions. The resulting XObject declares those
      # same dimensions, an RGB colorspace and 8 bits per component.
      # @param image_data Image data accepted by +ImageCompressor.compress_image+.
      # @param image_quality Quality setting forwarded to the compressor.
      # @param width Maximum width for compression, also written as the XObject width.
      # @param height Maximum height for compression, also written as the XObject height.
      # @return The newly built image XObject.
      def self.process_image_xobject(image_data, image_quality, width, height)
        payload = Image::ImageCompressor.compress_image(
          image_data,
          quality: image_quality,
          max_width: width,
          max_height: height
        )

        Origami::Graphics::ImageXObject.new.tap do |xobject|
          xobject.data = payload
          xobject.Width = width
          xobject.Height = height
          xobject.ColorSpace = :DeviceRGB
          xobject.BitsPerComponent = 8
        end
      end
    end
  end
end
Modules

Classes

lib/mindee/pdf/pdf_tools.rb

Source Files