class Mindee::Extraction::PdfExtractor::PdfExtractor

Pdf extraction class.

def cut_pages(page_indexes)

Returns:
  • (StreamIO) - The buffer containing the new Pdf.

Parameters:
  • page_indexes (Array) -- List of page numbers to use for merging in the original Pdf.
# Asks the Pdf processor for a document restricted to the requested pages.
# @param page_indexes [Array] Page numbers, forwarded as the `page_indexes` option.
# @return The result of Mindee::PDF::PdfProcessor.parse on the source Pdf
#   (documented as the buffer containing the new Pdf).
def cut_pages(page_indexes)
  Mindee::PDF::PdfProcessor.parse(@source_pdf, { page_indexes: page_indexes })
end

def extract_invoices(page_indexes, strict: false)

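Returns:
  • (Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>) - The extracted sub-documents, as returned by extract_sub_documents.

Parameters:
  • strict (Boolean) -- When true, page groups are regrouped according to their confidence (threshold 0.5) before extraction; when false, every group is extracted as-is.
  • page_indexes (Array<Array<Integer>>, Array<InvoiceSplitterV1PageGroup>) -- Lists of page indexes, or invoice splitter page groups, to extract.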
# Extracts invoices from the source Pdf, one extracted Pdf per group of pages.
#
# Plain lists of page indexes are handed straight to extract_sub_documents. Invoice
# splitter page groups are reduced to their page indexes first.
#
# @param page_indexes [Array<Array<Integer>>, Array<InvoiceSplitterV1PageGroup>]
#   Lists of page indexes, or page groups exposing `page_indexes` and `confidence`.
# @param strict [Boolean] When true, a page group whose confidence is below 0.5 is not
#   extracted on its own: its pages are appended to the group that precedes it. A
#   low-confidence group with no predecessor starts its own document.
# @return [Array] The value returned by extract_sub_documents for the resulting lists.
# @raise [RuntimeError] If `page_indexes` is empty.
def extract_invoices(page_indexes, strict: false)
  raise 'No indexes provided.' if page_indexes.empty?
  unless page_indexes[0].is_a?(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1PageGroup)
    return extract_sub_documents(page_indexes)
  end
  return extract_sub_documents(page_indexes.map(&:page_indexes)) unless strict

  merged_page_indexes = page_indexes.each_with_object([]) do |page_group, merged|
    if page_group.confidence >= 0.5 || merged.empty?
      # Copy the list so that later merges never mutate the caller's page groups.
      merged << page_group.page_indexes.dup
    else
      merged.last.concat(page_group.page_indexes)
    end
  end
  extract_sub_documents(merged_page_indexes)
end

def extract_sub_documents(page_indexes)

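Returns:
  • (Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>) - One extracted Pdf per list of page indexes.

Parameters:
  • page_indexes (Array<Array<Integer>>) -- Lists of page numbers; each list is merged into one new Pdf taken from the original Pdf.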
# Extracts one Pdf per list of page indexes.
#
# Each extracted Pdf is named "<basename>_<first>-<last><extension>", where first and
# last are the 1-based, zero-padded numbers of the first and last entries of the list.
#
# @param page_indexes [Array<Array<Integer>>] Lists of 0-based page indexes.
# @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>] One entry per list, in order.
# @raise [RuntimeError] If a list is nil or empty, or if an index is negative or not
#   lower than the page count.
def extract_sub_documents(page_indexes)
  extension = File.extname(@filename)
  basename = File.basename(@filename, extension)
  total_pages = nil # resolved on first use so the page count is computed at most once
  page_indexes.map do |page_index_list|
    # nil must be tested first: calling empty? on nil would raise NoMethodError.
    if page_index_list.nil? || page_index_list.empty?
      raise "Empty indexes aren't allowed for extraction #{page_index_list}"
    end

    total_pages ||= page_count
    page_index_list.each do |page_index|
      # Indexes are 0-based, so the highest valid index is total_pages - 1.
      raise "Index #{page_index} is out of range." if page_index.negative? || page_index >= total_pages
    end
    first_page = format('%03d', page_index_list.first + 1)
    last_page = format('%03d', page_index_list.last + 1)
    field_filename = "#{basename}_#{first_page}-#{last_page}#{extension}"
    Mindee::Extraction::PdfExtractor::ExtractedPdf.new(cut_pages(page_index_list), field_filename)
  end
end

def initialize(local_input)

Parameters:
  • local_input (Mindee::Input::Source::LocalInputSource) -- Source file; a Pdf is used as-is, any other input is first attached to a new Pdf.
# Prepares the extractor from a local input source.
# @param local_input [Mindee::Input::Source::LocalInputSource] Source file; its filename
#   is kept for naming, and its stream is used directly when the input is a Pdf.
def initialize(local_input)
  @filename = local_input.filename
  @source_pdf =
    if local_input.pdf?
      local_input.io_stream
    else
      # Any other input is attached to a new Pdf, which is then saved into an in-memory buffer.
      image_as_pdf = Extraction::ImageExtractor.attach_image_as_new_file(local_input.io_stream)
      StringIO.new.tap { |buffer| image_as_pdf.save(buffer) }
    end
end

def page_count

Returns:
  • (Integer) - Number of pages in the source Pdf.
# Opens the source Pdf and counts its pages.
# @return [Integer] Size of the `pages` collection of the opened Pdf.
def page_count
  opened_pdf = Mindee::PDF::PdfProcessor.open_pdf(@source_pdf)
  opened_pdf.pages.size
end