lib/mindee/pdf/pdf_processor.rb



# frozen_string_literal: true

require 'set'
require 'origami'
require_relative 'pdf_tools'

module Mindee
  module PDF
    # PDF document processing
    module PDFProcessor
      # Mixes PDFTools into Origami::PDF at load time.
      # NOTE(review): presumably this is where +to_io_stream+ and +delete_pages_at+
      # (called on parsed documents below) come from — confirm in pdf_tools.
      Origami::PDF.include(PDFTools)
      # Applies the requested page operation to a PDF and returns the resulting stream.
      #
      # The document is returned untouched when it has fewer pages than
      # +options.on_min_pages+, or when the operation would remove every page.
      #
      # @param io_stream [StringIO] Stream holding the PDF; it is rewound before being read.
      # @param options [PageOptions] Must respond to +on_min_pages+, +operation+ and +page_indexes+.
      # @return [StringIO]
      # @raise [ArgumentError] If +options.operation+ is neither +:KEEP_ONLY+ nor +:REMOVE+.
      def self.parse(io_stream, options)
        current_pdf = open_pdf(io_stream)
        pages_count = current_pdf.pages.size
        return current_pdf.to_io_stream if options.on_min_pages.to_i > pages_count

        all_pages = (0...pages_count).to_a

        pages_to_remove =
          if options.operation == :KEEP_ONLY
            indexes_from_keep(options.page_indexes, all_pages)
          elsif options.operation == :REMOVE
            indexes_from_remove(options.page_indexes, all_pages)
          else
            raise ArgumentError, "operation must be one of :KEEP_ONLY or :REMOVE, sent '#{options.operation}'"
          end

        # Compare as sets so the "never delete every page" guard does not depend on
        # the order (or duplication) of the requested indexes.
        removes_everything = pages_to_remove.to_set == all_pages.to_set
        current_pdf.delete_pages_at(pages_to_remove) unless removes_everything
        current_pdf.to_io_stream
      end

      # Computes which pages must be deleted so that only the requested pages remain.
      #
      # Negative indexes count from the end of the document (-1 is the last page,
      # -2 the one before it, ...). Indexes that fall outside the document are ignored.
      #
      # @param page_indexes [Array<Integer>] Indexes of the pages to keep.
      # @param all_pages [Array<Integer>] Every page index of the document.
      # @return [Set<Integer>] Page indexes that are not kept.
      def self.indexes_from_keep(page_indexes, all_pages)
        # Array#[] already resolves negative indexes from the end and yields nil
        # when out of range, so no manual index arithmetic is needed.
        pages_to_keep = page_indexes.map { |idx| all_pages[idx] }.compact.to_set
        all_pages.to_set - pages_to_keep
      end

      # Resolves the requested indexes into the set of pages to delete.
      #
      # Negative indexes count from the end of the document (-1 is the last page).
      # Indexes that fall outside the document are ignored, and duplicates collapse.
      #
      # @param page_indexes [Array<Integer>] Indexes of the pages to remove.
      # @param all_pages [Array<Integer>] Every page index of the document.
      # @return [Set<Integer>] Page indexes to delete.
      def self.indexes_from_remove(page_indexes, all_pages)
        # Return the resolved set itself; ending the method on +each+ would hand
        # back the raw, unvalidated +page_indexes+ instead.
        page_indexes.map { |idx| all_pages[idx] }.compact.to_set
      end

      # Parses a stream into an Origami PDF, reading from the start of the stream.
      #
      # @param io_stream [StringIO] Stream holding the PDF; it is rewound to position 0.
      # @return [Origami::PDF]
      def self.open_pdf(io_stream)
        io_stream.seek(0)
        quiet_options = { verbosity: Origami::Parser::VERBOSE_QUIET }
        Origami::PDF::LinearParser.new(quiet_options).parse(io_stream)
      end

      # Extracts one page of a PDF document as its own stream.
      #
      # The document is saved into an in-memory stream, which is then passed to
      # {parse} with +page_id - 1+ as the only page index.
      # NOTE(review): relies on PageOptions' default operation and on_min_pages,
      # which are not visible here — confirm they keep only the listed page.
      #
      # @param [Origami::PDF] pdf_doc Origami PDF handle.
      # @param [Integer] page_id Page ID; one is subtracted to obtain the page index.
      # @return [StringIO]
      def self.get_page(pdf_doc, page_id)
        buffer = StringIO.new
        pdf_doc.save(buffer)

        page_index = page_id - 1
        single_page_options = PageOptions.new(params: { page_indexes: [page_index] })
        parse(buffer, single_page_options)
      end
    end
  end
end