# coding: utf-8
# typed: true
# frozen_string_literal: true

require 'forwardable'
require 'pdf/reader/page_layout'

module PDF
  class Reader

    # Builds a UTF-8 string of all the text on a single page by processing all
    # the operators in a content stream.
    #
    # A page is attached with #page=, after which operator callbacks are either
    # delegated to the page state or handled by the text showing methods below.
    # Every glyph shown that does not decode to a single space is recorded as a
    # TextRun; the recorded runs are retrieved (and filtered) with #runs.
    #
    class PageTextReceiver
      extend Forwardable

      SPACE = " "

      # NOTE(review): nothing in this class assigns @options, so the reader
      # returns nil unless it is set elsewhere. Kept for interface
      # compatibility.
      attr_reader :state, :options

      ########## BEGIN FORWARDERS ##########
      # Graphics State Operators
      def_delegators :@state, :save_graphics_state, :restore_graphics_state

      # Matrix Operators
      def_delegators :@state, :concatenate_matrix

      # Text Object Operators
      def_delegators :@state, :begin_text_object, :end_text_object

      # Text State Operators
      def_delegators :@state, :set_character_spacing, :set_horizontal_text_scaling
      def_delegators :@state, :set_text_font_and_size, :font_size
      def_delegators :@state, :set_text_leading, :set_text_rendering_mode
      def_delegators :@state, :set_text_rise, :set_word_spacing

      # Text Positioning Operators
      def_delegators :@state, :move_text_position, :move_text_position_and_set_leading
      def_delegators :@state, :set_text_matrix_and_text_line_matrix, :move_to_start_of_next_line
      ##########  END FORWARDERS  ##########

      # starting a new page
      #
      # Creates a fresh PageState for the page and discards any characters
      # recorded for a previous page.
      def page=(page)
        @state = PageState.new(page)
        @page = page
        @content = []
        @characters = []
      end

      # Returns the text runs recorded for the current page, after filtering.
      #
      # Recognised keys in +opts+ (each read with Hash#fetch):
      #
      # :rect::             only keep runs within this rectangle. Defaults to
      #                     the page's :CropBox entry; a nil/false value
      #                     disables the filter.
      # :skip_zero_width::  drop zero width runs (default true)
      # :skip_overlapping:: drop redundant overlapping runs (default true)
      # :merge::            merge runs that are in close proximity
      #                     (default true)
      # :only::             when set, passed to AdvancedTextRunFilter.only
      # :exclude::          when set, passed to AdvancedTextRunFilter.exclude
      #
      # Runs with no text are always removed.
      def runs(opts = {})
        # Named +selected+ rather than +runs+ so the local does not shadow
        # this method.
        selected = @characters

        if (rect = opts.fetch(:rect, @page.rectangles[:CropBox]))
          selected = BoundingRectangleRunsFilter.runs_within_rect(selected, rect)
        end

        if opts.fetch(:skip_zero_width, true)
          selected = ZeroWidthRunsFilter.exclude_zero_width_runs(selected)
        end

        if opts.fetch(:skip_overlapping, true)
          selected = OverlappingRunsFilter.exclude_redundant_runs(selected)
        end

        selected = NoTextFilter.exclude_empty_strings(selected)

        selected = merge_runs(selected) if opts.fetch(:merge, true)

        if (only_filter = opts.fetch(:only, nil))
          selected = AdvancedTextRunFilter.only(selected, only_filter)
        end

        if (exclude_filter = opts.fetch(:exclude, nil))
          selected = AdvancedTextRunFilter.exclude(selected, exclude_filter)
        end

        selected
      end

      # deprecated
      #
      # Lays out the default #runs with PageLayout, using the page's :MediaBox
      # rectangle, and returns the result as a string.
      def content
        mediabox = @page.rectangles[:MediaBox]
        PageLayout.new(runs, mediabox).to_s
      end

      #####################################################
      # Text Showing Operators
      #####################################################

      # record text that is drawn on the page
      def show_text(string) # Tj (AWAY)
        internal_show_text(string)
      end

      # Handles an array that mixes strings and numbers. Strings are recorded
      # as text; numbers are handed to the page state as a glyph displacement.
      # Anything else is skipped.
      def show_text_with_positioning(params) # TJ [(A) 120 (WA) 20 (Y)]
        params.each do |arg|
          case arg
          when String  then internal_show_text(arg)
          when Numeric then @state.process_glyph_displacement(0, arg, false)
          end
        end
      end

      # Moves the page state to the start of the next line, then shows +str+.
      def move_to_next_line_and_show_text(str) # '
        @state.move_to_start_of_next_line
        show_text(str)
      end

      # Sets word spacing (+aw+) and character spacing (+ac+) on the page
      # state, then moves to the next line and shows +string+.
      def set_spacing_next_line_show_text(aw, ac, string) # "
        @state.set_word_spacing(aw)
        @state.set_character_spacing(ac)
        move_to_next_line_and_show_text(string)
      end

      #####################################################
      # XObjects
      #####################################################

      # Asks the page state to invoke the XObject named +label+. Form XObjects
      # yielded back are walked with this receiver; other objects are ignored.
      def invoke_xobject(label)
        @state.invoke_xobject(label) do |xobj|
          xobj.walk(self) if xobj.is_a?(PDF::Reader::FormXObject)
        end
      end

      private

      # Records a TextRun for each glyph in +string+ and advances the page
      # state past it.
      #
      # Raises PDF::Reader::MalformedPDFError when the page state has no
      # current font. +string+ is type checked via
      # PDF::Reader::Error.validate_type_as_malformed first.
      def internal_show_text(string)
        PDF::Reader::Error.validate_type_as_malformed(string, "string", String)

        font = @state.current_font
        raise PDF::Reader::MalformedPDFError, "current font is invalid" if font.nil?

        font.unpack(string).each do |glyph_code|
          # paint the current glyph
          newx, newy = @state.trm_transform(0, 0)
          newx, newy = apply_rotation(newx, newy)

          utf8_chars = font.to_utf8(glyph_code)
          is_space = utf8_chars == SPACE

          glyph_width = font.glyph_width_in_text_space(glyph_code)

          # Spaces are not recorded as runs, but they still move the text
          # position below.
          unless is_space
            scaled_glyph_width = glyph_width * @state.font_size
            @characters << TextRun.new(newx, newy, scaled_glyph_width, @state.font_size, utf8_chars)
          end

          # apply the glyph displacement for the current glyph so the next
          # glyph will appear in the correct position
          @state.process_glyph_displacement(glyph_width, 0, is_space)
        end
      end

      # Maps the point (+x+, +y+) according to the page's rotate value. Only
      # 90, 180 and 270 change the point; any other value returns it as is.
      #
      # Returns a two element array: [x, y].
      def apply_rotation(x, y)
        case @page.rotate
        when 90  then [y, -x]
        when 180 then [-x, -y]
        when 270 then [-y, x]
        else          [x, y]
        end
      end

      # take a collection of TextRun objects and merge any that are in close
      # proximity
      #
      # Runs are grouped by the integer part of their y value, each group is
      # sorted and merged independently, and the combined result is sorted.
      def merge_runs(runs)
        runs.group_by { |run| run.y.to_i }
            .flat_map { |_y, row| group_chars_into_runs(row.sort) }
            .sort
      end

      # Walks +chars+ in order, folding each one into the previously collected
      # run when that run reports it as mergable, and starting a new run
      # otherwise. Returns the array of resulting runs.
      def group_chars_into_runs(chars)
        chars.each_with_object([]) do |char, merged|
          if !merged.empty? && merged.last.mergable?(char)
            merged[-1] = merged.last + char
          else
            merged << char
          end
        end
      end
    end
  end
end