# frozen_string_literal: true

require 'mini_magick'
require 'origami'
require 'stringio'
require 'tempfile'
require_relative '../../input/sources'
require_relative 'extracted_image'

module Mindee
  # Image Extraction Module.
  module Extraction
    # Image Extraction wrapper module.
    module ImageExtractor
      # Converts an image into a PDF and loads the result as an Origami document.
      #
      # @param [StringIO] input_buffer Input buffer. Only supports JPEG.
      # @param [String] format Image format the input is converted to before the PDF conversion.
      # @return [Origami::PDF] A PdfDocument handle.
      def self.attach_image_as_new_file(input_buffer, format: 'jpg')
        magick_image = MiniMagick::Image.read(input_buffer)
        # NOTE: some jpeg images get rendered as three different versions of themselves per output if the format isn't
        # converted.
        magick_image.format(format)
        original_density = magick_image.resolution
        # No clue why the resolution needs to be reduced for the pdf, otherwise the resulting image shrinks.
        # NOTE(review): 4.166666 is approximately 300 / 72, presumably a DPI ratio -- confirm.
        scale_factor = original_density[0].to_f / 4.166666
        magick_image.format('pdf', 0, { density: scale_factor.to_s })
        Origami::PDF.read(StringIO.new(magick_image.to_blob))
      end

      # Extracts multiple images from a given local input source.
      #
      # @param [Mindee::Input::Source::LocalInputSource] input_source
      # @param [Integer] page_id ID of the Page to extract from.
      # @param [Array<Array<Mindee::Geometry::Point>>, Array<Mindee::Geometry::Quadrangle>] polygons List of coordinates
      # to extract.
      # @return [Array<Mindee::Extraction::ExtractedImage>] Extracted Images.
      def self.extract_multiple_images_from_source(input_source, page_id, polygons)
        new_stream = load_input_source_pdf_page_as_image(input_source, page_id)
        new_stream.seek(0)
        extract_images_from_polygons(input_source, new_stream, page_id, polygons)
      end

      # Extracts images from their positions on a file (as polygons).
      #
      # One image is produced per polygon; its element ID is the polygon's index in the list.
      #
      # @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
      # @param [StringIO] pdf_stream Buffer of the PDF.
      # @param [Integer] page_id Page ID.
      # @param [Array<Mindee::Geometry::Point, Mindee::Geometry::Polygon, Mindee::Geometry::Quadrangle>] polygons
      # @return [Array<Mindee::Extraction::ExtractedImage>] Extracted Images.
      def self.extract_images_from_polygons(input_source, pdf_stream, page_id, polygons)
        polygons.each_with_index.map do |polygon, element_id|
          polygon = normalize_polygon(polygon)
          # crop_image crops the image it is given in place, hence each polygon gets a freshly read page.
          page_content = read_page_content(pdf_stream)

          corners = [polygon.top_left, polygon.bottom_right, polygon.top_right, polygon.bottom_left]
          min_max_x = Geometry.get_min_max_x(corners)
          min_max_y = Geometry.get_min_max_y(corners)
          file_extension = determine_file_extension(input_source)
          cropped_image = crop_image(page_content, min_max_x, min_max_y)
          # determine_file_extension already maps PDF sources to 'jpg'; the 'pdf' guard is kept as a safety net.
          cropped_image.format(file_extension == 'pdf' ? 'jpg' : file_extension)

          buffer = StringIO.new
          write_image_to_buffer(cropped_image, buffer)
          file_name = "#{input_source.filename}_page#{page_id}-#{element_id}.#{file_extension}"

          create_extracted_image(buffer, file_name, page_id, element_id)
        end
      end

      # Retrieves the bounding box of a polygon.
      #
      # Anything that is not a Mindee::Geometry::Polygon is returned unchanged.
      #
      # @param [Array<Point>, Mindee::Geometry::Polygon] polygon
      # @return [Object] The bounding box of the polygon, or the input itself when it is not a Polygon.
      def self.normalize_polygon(polygon)
        return polygon unless polygon.is_a?(Mindee::Geometry::Polygon)

        Mindee::Geometry.get_bounding_box(polygon)
      end

      # Loads a buffer into a MiniMagick Image.
      #
      # The stream is rewound before being read.
      #
      # @param [StringIO] pdf_stream Buffer containing the PDF
      # @return [MiniMagick::Image] a valid MiniMagick image handle.
      def self.read_page_content(pdf_stream)
        pdf_stream.rewind
        MiniMagick::Image.read(pdf_stream)
      end

      # Crops a MiniMagick Image to the given bounding box.
      #
      # The image is converted to jpg and cropped in place. The min/max values are multiplied by the image's width
      # and height to build the crop geometry (presumably relative coordinates between 0 and 1 -- confirm).
      #
      # @param [MiniMagick::Image] image Input Image.
      # @param [Mindee::Geometry::MinMax] min_max_x minimum & maximum values for the x coordinates.
      # @param [Mindee::Geometry::MinMax] min_max_y minimum & maximum values for the y coordinates.
      # @return [MiniMagick::Image] The same image handle, once cropped.
      def self.crop_image(image, min_max_x, min_max_y)
        # Dimensions are read before the format conversion.
        width = image[:width].to_i
        height = image[:height].to_i

        image.format('jpg')
        new_width = (min_max_x.max - min_max_x.min) * width
        new_height = (min_max_y.max - min_max_y.min) * height
        image.crop("#{new_width}x#{new_height}+#{min_max_x.min * width}+#{min_max_y.min * height}")

        image
      end

      # Writes a MiniMagick::Image to a buffer.
      #
      # @param [MiniMagick::Image] image a valid MiniMagick image.
      # @param [StringIO] buffer
      def self.write_image_to_buffer(image, buffer)
        image.write(buffer)
      end

      # Retrieves the file extension from the main file to apply it to the extracted images. Note: coerces pdf as jpg.
      #
      # @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
      # @return [String, nil] A file extension without the leading dot, lowercased. May be nil when the filename has
      #   no extension.
      def self.determine_file_extension(input_source)
        return 'jpg' if input_source.pdf? || input_source.filename.downcase.end_with?('pdf')

        File.extname(input_source.filename).strip.downcase[1..]
      end

      # Generates an ExtractedImage.
      #
      # The buffer is rewound and read in full to build the underlying bytes input source.
      #
      # @param [StringIO] buffer Buffer containing the image.
      # @param [String] file_name Name for the file.
      # @param [Object] page_id ID of the page the file was generated from.
      # @param [Object] element_id ID of the element of a given page.
      # @return [Mindee::Extraction::ExtractedImage]
      def self.create_extracted_image(buffer, file_name, page_id, element_id)
        buffer.rewind
        ExtractedImage.new(
          Mindee::Input::Source::BytesInputSource.new(buffer.read, file_name),
          page_id,
          element_id
        )
      end

      # Loads a single page from an image file or a pdf document.
      #
      # The input's stream is rewound first. Non-PDF inputs are returned as their own io_stream; PDF inputs are
      # handed to Mindee::PDF::PdfProcessor.get_page.
      #
      # @param input_file [LocalInputSource] Local input.
      # @param [Integer] page_id Page ID.
      # @return [StringIO] Stream holding the page. NOTE(review): the PDF branch returns whatever
      #   PdfProcessor.get_page returns, presumably a stream since callers call seek on it -- confirm.
      def self.load_input_source_pdf_page_as_image(input_file, page_id)
        input_file.io_stream.rewind
        return input_file.io_stream unless input_file.pdf?

        Mindee::PDF::PdfProcessor.get_page(Origami::PDF.read(input_file.io_stream), page_id)
      end
    end
  end
end