lib/pdf/reader/page_layout.rb



# coding: utf-8
# typed: true
# frozen_string_literal: true

require 'pdf/reader/overlapping_runs_filter'
require 'pdf/reader/zero_width_runs_filter'

class PDF::Reader

  # Takes a collection of TextRun objects and renders them into a single
  # string that best approximates the way they'd appear on a rendered PDF page.
  #
  # media box should be a 4 number array that describes the dimensions of the
  # page to be rendered as described by the page's MediaBox attribute
  class PageLayout

    # Font size used when the mean font size of the runs works out to zero
    # (which includes the case where there are no runs at all).
    DEFAULT_FONT_SIZE = 12

    # runs     - the TextRun objects to lay out. Zero width and redundant
    #            overlapping runs are filtered out, then nearby runs on the
    #            same line are merged.
    # mediabox - a 4-element array; elements 0 and 2 are used as the
    #            horizontal extents and elements 1 and 3 as the vertical ones.
    #            Checked with PDF::Reader::Error.validate_not_nil.
    def initialize(runs, mediabox)
      # mediabox is a 4-element array for now, but it'd be nice to switch to a
      # PDF::Reader::Rectangle at some point
      PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")

      runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
      runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
      @mediabox = mediabox
      @runs = merge_runs(runs)

      # mean never returns nil (it yields 0 for an empty collection), so only
      # the zero case needs a fallback. A zero font size would otherwise turn
      # into a division by zero in row_count.
      @mean_font_size = mean(@runs.map(&:font_size))
      @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
      @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
      @x_offset = @runs.map(&:x).min || 0

      # only shift the page vertically when some text sits at or below y=0
      lowest_y = @runs.map(&:y).min || 0
      @y_offset = lowest_y > 0 ? 0 : lowest_y
    end

    # Renders the runs onto a grid of characters and returns it as a single
    # string, one line per row. Blank rows before the first and after the last
    # row containing text are dropped and trailing whitespace is removed from
    # every line. Returns "" when there are no runs or the grid has no rows.
    def to_s
      return "" if @runs.empty?
      return "" if row_count == 0

      page = Array.new(row_count) { " " * col_count }
      @runs.each do |run|
        x_pos = ((run.x - @x_offset) / col_multiplier).round
        y_pos = row_count - ((run.y - @y_offset) / row_multiplier).round
        next unless y_pos.between?(0, row_count) && x_pos.between?(0, col_count)

        # y_pos counts rows from 1 at the top of the page. A run sitting on
        # the very top edge computes to 0; clamp it onto the first row rather
        # than letting an index of -1 wrap around to the bottom row.
        row_index = [y_pos - 1, 0].max
        local_string_insert(page[row_index], run.text, x_pos)
      end
      interesting_rows(page).map(&:rstrip).join("\n")
    end

    private

    # horizontal size of the page, taken from the mediabox
    def page_width
      # TODO once @mediabox is a Rectangle, this can be just `@mediabox.width`
      (@mediabox[2].to_f - @mediabox[0].to_f).abs
    end

    # vertical size of the page, taken from the mediabox
    def page_height
      # TODO once @mediabox is a Rectangle, this can be just `@mediabox.height`
      (@mediabox[3].to_f - @mediabox[1].to_f).abs
    end

    # given an array of strings, return a new array with empty rows from the
    # beginning and end removed. Rows containing only whitespace count as
    # empty, and empty rows between rows with text are kept.
    #
    #   interesting_rows([ "", "one", "two", "" ])
    #   => [ "one", "two" ]
    #
    def interesting_rows(rows)
      line_lengths = rows.map { |l| l.strip.length }

      first_line_with_text = line_lengths.index { |l| l > 0 }
      return [] if first_line_with_text.nil?

      last_line_with_text = line_lengths.rindex { |l| l > 0 }
      rows[first_line_with_text..last_line_with_text]
    end

    # number of text rows on the page: one row per mean font size of height.
    # Memoized.
    def row_count
      @row_count ||= (page_height / @mean_font_size).floor
    end

    # number of character columns on the page: the number of median-width
    # glyphs that fit across the page, widened by 5%. Memoized.
    def col_count
      @col_count ||= ((page_width  / @median_glyph_width) * 1.05).floor
    end

    # height of a single row, in the same units as the mediabox. Memoized.
    def row_multiplier
      @row_multiplier ||= page_height.to_f / row_count.to_f
    end

    # width of a single column, in the same units as the mediabox. Memoized.
    def col_multiplier
      @col_multiplier ||= page_width.to_f / col_count.to_f
    end

    # arithmetic mean of a collection of numbers, as a Float. Returns the
    # Integer 0 for an empty collection.
    def mean(collection)
      return 0 if collection.empty?

      collection.inject(0) { |accum, v| accum + v } / collection.size.to_f
    end

    # middle value of the sorted collection. For a collection with an even
    # number of items this is the higher of the two middle values. Returns 0
    # for an empty collection.
    def median(collection)
      return 0 if collection.empty?

      collection.sort[collection.size / 2]
    end

    # take a collection of TextRun objects and merge any that are in close
    # proximity. Runs are only considered for merging when the integer parts
    # of their y values match. Returns a sorted, flat array of runs.
    def merge_runs(runs)
      runs.group_by { |char|
        char.y.to_i
      }.flat_map { |_y, chars|
        group_chars_into_runs(chars.sort)
      }.sort
    end

    # walk over a sorted list of runs from a single line, joining each one
    # onto the previous run whenever the previous run reports it as mergable
    def group_chars_into_runs(chars)
      chars.each_with_object([]) do |char, runs|
        if !runs.empty? && runs.last.mergable?(char)
          runs[-1] = runs.last + char
        else
          runs << char
        end
      end
    end

    # overwrite part of haystack, in place, with needle, starting at index.
    # haystack grows if needle runs past its end.
    def local_string_insert(haystack, needle, index)
      # The start/length form is used instead of a Range so that an empty
      # needle at index 0 is a no-op; the equivalent range would be 0..-1,
      # which covers (and so would wipe out) the whole string.
      haystack[index, needle.length] = String.new(needle)
    end
  end
end