class HexaPDF::CLI::Inspect
Shows the internal structure of a PDF file.
def execute(file, *commands) #:nodoc:
# Runs the inspect command on +file+.
#
# The document is opened through +with_document+ with the configured password. If +commands+
# are given, they are passed to #execute_commands once. Otherwise an interactive prompt is
# shown that reads and executes one line at a time until the input ends or a command asks to
# quit.
def execute(file, *commands) #:nodoc:
  with_document(file, password: @password) do |doc|
    # Code points without a Unicode mapping are reported on stderr and substituted by U+FFFD.
    doc.config['font.on_missing_unicode_mapping'] = ->(code, font) do
      message = "No Unicode mapping for code point #{code} in font #{font[:BaseFont]}, " \
                "using the Unicode replacement character"
      $stderr.puts(message)
      "\u{FFFD}"
    end
    @doc = doc

    # Non-interactive use: run the given commands and be done.
    next execute_commands(commands) unless commands.empty?

    # Line editing is optional; without reline the plain stdin prompt of #read_input is used.
    begin
      require 'reline'
      Reline.completion_proc = RELINE_COMPLETION_PROC
      Reline.completion_append_character = " "
    rescue LoadError
      notice = "Library reline not available, history and line editing not available"
      $stderr.puts(notice) if command_parser.verbosity_info?
    end

    finished = false
    until finished
      line = read_input
      if line
        # A token is either a quoted string (quotes removed) or a run of non-whitespace.
        tokens = line.scan(/(["'])(.+?)\1|(\S+)/).map {|_quote, quoted, bare| quoted || bare }
        finished = execute_commands(tokens)
      else
        # End of input: finish the prompt line before leaving.
        puts
        finished = true
      end
    end
  end
end
def execute_commands(data) #:nodoc:
# Executes the inspect commands contained in +data+, an array of command and argument strings.
#
# The array is consumed (and modified) while processing. Entries equal to ";" only separate
# commands and are skipped. Problems with a single command (unknown command, missing or invalid
# argument, invalid page specification, invalid search expression) are reported on stderr and
# processing continues with the next entry.
#
# Returns +true+ if a quit command was executed, +false+ otherwise.
def execute_commands(data) #:nodoc:
  data.map! {|item| item == ";" ? nil : item }
  until data.empty?
    command = data.shift || next
    case command
    when /\A\d+(,\d+)?\z/, 'o', 'object'
      # A bare OID[,GEN] is shorthand for the object command with that argument.
      arg = (command.start_with?('o') ? data.shift : command)
      # On failure the message is printed and obj is nil because $stderr.puts returns nil.
      obj = pdf_object_from_string_reference(arg) rescue $stderr.puts($!.message)
      if obj&.data&.stream && command_parser.verbosity_info?
        $stderr.puts("Note: Object also has stream data")
      end
      serialize(obj.value, recursive: false) if obj

    when 'r', 'recursive'
      # Without an argument the trailer is used as the start of the recursive output.
      obj = if (obj = data.shift)
              pdf_object_from_string_reference(obj) rescue $stderr.puts($!.message)
            else
              @doc.trailer
            end
      serialize(obj.value, recursive: true) if obj

    when 's', 'stream', 'raw', 'raw-stream', 'sd'
      if (obj = pdf_object_from_string_reference(data.shift) rescue $stderr.puts($!.message)) &&
          obj.kind_of?(HexaPDF::Stream)
        if command == 'sd'
          if obj.respond_to?(:process_contents)
            obj.process_contents(ContentProcessor.new)
          else
            $stderr.puts("Error: The object is not a Form XObject or page")
          end
        else
          source = (command.start_with?('raw') ? obj.stream_source : obj.stream_decoder)
          # The source is resumed piece by piece so the stream is written out incrementally.
          while source.alive? && (stream_data = source.resume)
            $stdout.write(stream_data)
          end
        end
      elsif command_parser.verbosity_info?
        $stderr.puts("Note: Object has no stream data")
      end

    when 'x', 'xref'
      if (obj = pdf_object_from_string_reference(data.shift) rescue $stderr.puts($!.message))
        # Walk the revisions from last to first and show the first entry found.
        @doc.revisions.reverse_each do |rev|
          if (xref = rev.xref(obj))
            puts xref
            break
          end
        end
      end

    when 'c', 'catalog'
      serialize(@doc.catalog.value, recursive: false)

    when 't', 'trailer'
      serialize(@doc.trailer.value, recursive: false)

    when 'p', 'pages'
      begin
        pages = parse_pages_specification(data.shift || '1-e', @doc.pages.count)
      rescue StandardError => e
        $stderr.puts("Error: #{e}")
        next
      end
      page_list = @doc.pages.to_a
      pages.each do |index, _|
        page = page_list[index]
        str = +"page #{index + 1} (#{page.oid},#{page.gen}): "
        str << Array(page[:Contents]).map {|c| "#{c.oid},#{c.gen}" }.join(" ")
        puts str
      end

    when 'po', 'ps', 'psd'
      page_number_str = data.shift
      unless page_number_str
        $stderr.puts("Error: Missing PAGE argument to #{command}")
        next
      end
      # A specification that cannot be parsed must not abort the whole command loop (the
      # 'pages' command above guards the same call); it is reported like one selecting no page.
      page_number = begin
        parse_pages_specification(page_number_str, @doc.pages.count).first&.first
      rescue StandardError
        nil
      end
      unless page_number
        $stderr.puts("Error: Invalid page number #{page_number_str}")
        next
      end
      page = @doc.pages[page_number]
      case command
      when 'ps'
        $stdout.write(page.contents)
      when 'psd'
        page.process_contents(ContentProcessor.new)
      else
        puts "#{page.oid} #{page.gen} obj"
        serialize(page.value, recursive: false)
        puts "endobj"
      end

    when 'pc', 'page-count'
      puts @doc.pages.count

    when 'search'
      regexp = data.shift
      unless regexp
        $stderr.puts("Error: Missing argument regexp")
        next
      end
      # The expression is user input; a malformed one is reported instead of raising.
      begin
        re = Regexp.new(regexp, Regexp::IGNORECASE)
      rescue RegexpError => e
        $stderr.puts("Error: Invalid regular expression: #{e.message}")
        next
      end
      @doc.each do |object|
        if @serializer.serialize(object.value).match?(re)
          puts "#{object.oid} #{object.gen} obj"
          serialize(object.value, recursive: false)
          puts "endobj"
        end
      end

    when 'rev', 'revision'
      if (rev_index = data.shift)
        # Revision numbers are 1-based on the command line; non-numeric input becomes -1 here.
        rev_index = rev_index.to_i - 1
        if rev_index < 0 || rev_index >= @doc.revisions.count
          $stderr.puts("Error: Invalid revision number specified")
          next
        end
        length = 0
        revision_information do |_, index, _, _, end_offset|
          length = end_offset if index == rev_index
        end
        # Copy the file from its start up to the end offset of the requested revision.
        IO.copy_stream(@doc.revisions.parser.io, $stdout, length, 0)
      else
        puts "Document has #{@doc.revisions.count} revision#{@doc.revisions.count == 1 ? '' : 's'}"
        revision_information do |rev, index, count, signature, end_offset|
          type = if rev.trailer[:XRefStm]
                   "xref table + stream"
                 elsif rev.trailer[:Type] == :XRef
                   "xref stream"
                 else
                   "xref table"
                 end
          puts "Revision #{index + 1}"
          puts " Type : #{type}"
          puts " Objects : #{count}"
          puts " Size : #{rev.trailer[:Size]}"
          puts " Signed : yes" if signature
          puts " Byte range: 0-#{end_offset}"
        end
      end

    when 'q', 'quit'
      return true

    when 'h', 'help'
      puts COMMAND_DESCRIPTIONS.map {|cmd, desc| cmd.ljust(35) << desc }.join("\n")

    else
      if command
        $stderr.puts("Error: Unknown command '#{command}' - enter 'h' for a list of commands")
      end
    end
  end

  false
end
def help_long_desc #:nodoc:
# Returns the long help text of this command: the text provided by +super+ with an appended
# "Interactive Mode Commands" section that contains an introduction followed by one formatted
# entry for each pair in COMMAND_DESCRIPTIONS.
def help_long_desc #:nodoc:
  output = super
  summary_width = command_parser.main_options.summary_width
  # NOTE(review): the line breaks inside this heredoc are reconstructed - confirm against the
  # original file if the exact wrapping matters.
  data = <<~HELP
    If a command or an argument is OID[,GEN], object and generation numbers are expected. The
    generation number defaults to 0 if not given. The available commands are:
  HELP
  # +format+ is called with width/indent keywords, so this is presumably the command's own
  # text formatting helper rather than Kernel#format - TODO confirm.
  content = format(data, indent: 0,
                   width: command_parser.help_line_width - command_parser.help_indent)
  content << "\n\n"
  COMMAND_DESCRIPTIONS.each do |cmd, desc|
    # The command is padded to the option summary column; following lines use the same indent.
    content << format(cmd.ljust(summary_width + 1) << desc,
                      width: command_parser.help_line_width - command_parser.help_indent,
                      indent: summary_width + 1, indent_first_line: false) << "\n"
  end
  output << cond_format_help_section("Interactive Mode Commands", content, preformatted: true)
end
def initialize #:nodoc:
# Creates the 'inspect' command: sets its short and long description, registers the
# --password/-p option and creates the serializer that is stored in @serializer.
def initialize #:nodoc:
  super('inspect', takes_commands: false)
  short_desc("Dig into the internal structure of a PDF file")
  # NOTE(review): line and paragraph breaks inside this heredoc are reconstructed - confirm
  # against the original file, a blank line between paragraphs may be missing.
  long_desc(<<~EOF)
    Inspects a PDF file for debugging or testing purposes. This command is useful when one
    needs to inspect the internal object structure or a stream of a PDF file. A PDF object is
    always shown in the PDF syntax. If no arguments are given, the interactive mode is started.
    Otherwise the arguments are interpreted as interactive mode commands and executed. It is
    possible to specify more than one command in this way by separating them with semicolons,
    or whitespace in case the number of command arguments is fixed.
  EOF
  # A value of '-' means the password is obtained through +read_password+ instead of being
  # taken from the command line.
  options.on("--password PASSWORD", "-p", String,
             "The password for decryption. Use - for reading from standard input.") do |pwd|
    @password = (pwd == '-' ? read_password : pwd)
  end
  # Initial value; presumably only replaced when the option block above is invoked - TODO confirm.
  @password = nil
  @serializer = HexaPDF::Serializer.new
end
def pdf_object_from_string_reference(str)
# Returns the PDF object of the current document that is identified by +str+, a string of the
# form OID[,GEN].
#
# Raises an Error with a printable message if +str+ is +nil+, if it is not exactly of the form
# OID[,GEN], or if the document has no object for the reference.
def pdf_object_from_string_reference(str)
  raise Error, "Error: Missing argument object identifier OID[,GEN]" if str.nil?

  # \A and \z anchor the whole string; ^ and $ would accept any multi-line string that merely
  # contains one valid line.
  unless str.match?(/\A\d+(,\d+)?\z/)
    raise Error, "Error: Invalid argument: Must be of form OID[,GEN], not '#{str}'"
  end

  @doc.object(pdf_reference_from_string(str)) ||
    raise(Error, "Error: No object with the given object identifier '#{str}' found")
end
def pdf_reference_from_string(str)
# Converts a string of the form OID[,GEN] into a HexaPDF::Reference.
#
# The generation number is 0 if the string contains no second, comma separated part.
def pdf_reference_from_string(str)
  numbers = str.split(",").map {|part| part.to_i }
  object_number = numbers[0]
  generation = numbers[1] || 0
  HexaPDF::Reference.new(object_number, generation)
end
def read_input
# Shows the prompt "cmd> " and returns the next line of interactive input.
#
# If the Reline constant is defined, the line is read with Reline and added to its history.
# Otherwise the prompt is printed and the line is read from standard input.
def read_input
  return Reline.readline("cmd> ", true) if Object.const_defined?("Reline")

  print "cmd> "
  $stdin.gets
end
def revision_information
Yields information about the document's revisions.

For each revision, the following information is yielded:

- The revision object itself
- The index of the revision in terms of all revisions of the document
- The number of objects in the revision
- The signature dictionary if this revision was signed
- The byte offset of the end of the revision
# Yields information about each revision of the document, in document order.
#
# The block receives the revision, its index, the number of objects in it, the signature
# dictionary found for it (or +nil+) and the byte offset at which the revision ends. For a
# revision with a signature the offset is taken from the end of the signature's byte range.
# Otherwise the file is searched for the %%EOF marker between this revision's cross-reference
# start and the next boundary. The offset is 0 if no marker is found.
#
# Returns an array with the block's result for each revision.
def revision_information
  signature_for = @doc.signatures.to_h do |signature|
    [@doc.revisions.find {|revision| revision.object(signature) == signature }, signature]
  end
  io = @doc.revisions.parser.io

  # Collect the boundaries between which a revision's %%EOF marker is searched: every /Prev
  # offset, the parser's startxref offset and the file size.
  io.seek(0, IO::SEEK_END)
  boundaries = @doc.revisions.map {|revision| revision.trailer[:Prev].to_i }
  boundaries.push(@doc.revisions.parser.startxref_offset, io.pos)
  boundaries.sort!
  boundaries.shift

  eof_marker = /(?:\n|\r\n?)\s*%%EOF\s*(?:\n|\r\n?)?/
  @doc.revisions.each_with_index.map do |revision, index|
    signature = signature_for[revision]
    end_offset =
      if signature
        byte_range = signature[:ByteRange]
        byte_range[-2] + byte_range[-1]
      else
        located = 0
        io.seek(boundaries[index], IO::SEEK_SET)
        chunk = ''.b
        while io.pos < boundaries[index + 1]
          chunk << io.read(1_000)
          marker = eof_marker.match(chunk)
          if marker
            # Translate the position inside the chunk back into a file offset.
            located = io.pos - chunk.size + marker.begin(0) + marker[0].size
            break
          end
          # Keep the tail so that a marker split across two reads is still found.
          chunk = chunk[-20..-1]
        end
        located
      end
    yield(revision, index, revision.each.count, signature, end_offset)
  end
end
def serialize(val, recursive: true, seen: {}, indent: 0) #:nodoc:
Prints the serialized value to the standard output. If +recursive+ is +true+, then the whole
object tree is printed, with object references to already printed objects replaced by
specially generated PDF references.
# Prints the serialized value to the standard output.
#
# If +recursive+ is +true+, the whole object tree is printed and hash entries are sorted. An
# object that has already been printed is replaced by a "{ref N}" marker that refers to the
# "{obj N}" marker printed at its first occurrence. If +recursive+ is +false+, indirect
# objects are printed as references ("OID GEN R").
#
# +seen+ maps the data of already printed objects to their marker labels and +indent+ is the
# nesting level used for indenting hash entries. A call with an +indent+ of 0 finishes the
# output with exactly one newline.
def serialize(val, recursive: true, seen: {}, indent: 0) #:nodoc:
  serialize_value(val, recursive: recursive, seen: seen, indent: indent)
  puts if indent == 0
end

# Prints +val+ like #serialize but never adds the final newline.
#
# Array elements, resolved references and direct objects are printed at the same +indent+ as
# their container, so the final newline must not be emitted from within the recursion.
def serialize_value(val, recursive:, seen:, indent:) #:nodoc:
  case val
  when Hash
    puts "<<"
    (recursive ? val.sort : val).each do |key, value|
      # Entries without a value are left out.
      next if value.nil? || (value.respond_to?(:null?) && value.null?)
      print "#{' ' * (indent + 1)}#{@serializer.serialize_symbol(key)} "
      serialize_value(value, recursive: recursive, seen: seen, indent: indent + 1)
      puts
    end
    print "#{' ' * indent}>>"
  when Array
    print "["
    val.each do |item|
      serialize_value(item, recursive: recursive, seen: seen, indent: indent)
      print " "
    end
    print "]"
  when HexaPDF::Reference
    serialize_value(@doc.object(val), recursive: recursive, seen: seen, indent: indent)
  when HexaPDF::Object
    if !recursive
      if val.indirect?
        print "#{val.oid} #{val.gen} R"
      else
        serialize_value(val.value, recursive: recursive, seen: seen, indent: indent)
      end
    elsif val.nil? || seen.key?(val.data)
      print "{ref #{seen[val.data]}}"
    else
      # Pages are labelled by their page number, everything else by a running number.
      seen[val.data] = (val.type == :Page ? "page #{val.index + 1}" : seen.length + 1)
      print "{obj #{seen[val.data]}} "
      serialize_value(val.value, recursive: recursive, seen: seen, indent: indent)
    end
  else
    print @serializer.serialize(val)
  end
end
def usage_arguments #:nodoc:
# Returns the argument part of the command's usage line: the file to inspect followed by any
# number of optional commands with their arguments.
def usage_arguments #:nodoc:
  'FILE [[CMD [ARGS]]...]'
end