class WWW::Mechanize
puts search_results.body
search_results = agent.submit(search_form)
search_form.field_with(:name => "q").value = "Hello"
search_form = page.form_with(:name => "f")
page = agent.get("http://www.google.com/")
agent.user_agent_alias = 'Mac Safari'
agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
require 'logger'
require 'mechanize'
require 'rubygems'
== Example
submitted. A history of URLs is maintained and can be queried.
can follow links, and submit forms. Form fields can be populated and
The Mechanize library is used for automating interaction with a website. It
= Synopsis
# Records +page+ in the history together with its resolved URI, then
# invokes the +history_added+ callback with the page when one is set.
def add_to_history(page)
  @history.push(page, resolve(page.uri))
  callback = history_added
  callback.call(page) if callback
end
# Stores the +user+ and +password+ used when a request needs
# authentication.
def auth(user, password)
  @user, @password = user, password
end
# Equivalent to the browser back button: removes the most recent entry
# from the history and returns it.
def back
  @history.pop
end
# Clicks the WWW::Mechanize::Link object passed in and returns the result
# of #get for its target.
#
# The target is +link.href+ when the object responds to +href+, otherwise
# its 'href' or 'src' attribute. The referer is the link's page when it can
# be obtained, falling back to the current page.
def click(link)
  # Best effort: objects without a usable #page simply have no referer.
  referer =
    begin
      link.page
    rescue StandardError
      nil
    end

  href =
    if link.respond_to?(:href)
      link.href
    else
      link['href'] || link['src']
    end

  get(:url => href, :referer => (referer || current_page()))
end
# Returns the contents of the cookie jar as an array.
def cookies
  @cookie_jar.to_a
end
# Returns the last entry in the history (the page currently being viewed).
def current_page
  @history.last
end
###
# DELETE to +url+ with +query_params+, and setting +options+:
#
#   delete('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
#
# Delegates to #put with <tt>:verb => :delete</tt> merged into +options+
# and returns whatever #put returns. A block given to this method is
# forwarded to #put, so it receives the fetched page just as it would when
# calling #put directly (previously the block was silently ignored).
def delete(url, query_params = {}, options = {}, &block)
  put(url, query_params, options.merge(:verb => :delete), &block)
end
# Performs the HTTP request described by +params+ and returns the
# resulting page.
#
# +params+ is merged over a set of defaults (verb +:get+, referer =
# current page, no params, no headers, zero redirects so far). The merged
# options hash is passed through the pre-connect chain, the request is
# sent (retried up to two more times on EOFError / ECONNRESET / EPIPE),
# and the options are then passed through the post-connect chain, which
# supplies +:res_klass+ and +:page+.
#
# Depending on the response this method then:
# * follows a meta refresh or +Refresh+ header when +follow_meta_refresh+
#   is set,
# * returns the page for a success response,
# * returns the previously visited page (or the new one) for 304,
# * follows a redirect when +follow_redirect?+ allows it,
# * retries once per host with credentials after a 401,
# * raises ResponseCodeError otherwise.
#
# Raises RedirectLimitReachedError when following would exceed
# +redirection_limit+.
def fetch_page(params)
  options = {
    :request    => nil,
    :response   => nil,
    :connection => nil,
    :referer    => current_page(),
    :uri        => nil,
    :verb       => :get,
    :agent      => self,
    :redirects  => 0,
    :params     => [],
    :headers    => {},
  }.merge(params)

  Chain.new([
    Chain::URIResolver.new(@scheme_handlers),
    Chain::ParameterResolver.new,
    Chain::RequestResolver.new,
    Chain::ConnectionResolver.new(
      @connection_cache,
      @keep_alive,
      @proxy_addr,
      @proxy_port,
      @proxy_user,
      @proxy_pass
    ),
    Chain::SSLResolver.new(@ca_file, @verify_callback, @cert, @key, @pass),
    Chain::AuthHeaders.new(@auth_hash, @user, @password, @digest),
    Chain::HeaderResolver.new(
      @keep_alive,
      @keep_alive_time,
      @cookie_jar,
      @user_agent
    ),
    Chain::CustomHeaders.new,
    @pre_connect_hook,
  ]).handle(options)

  # The chain above fills these entries of the options hash in.
  uri          = options[:uri]
  request      = options[:request]
  cur_page     = options[:referer]
  request_data = options[:params]
  redirects    = options[:redirects]
  http_obj     = options[:connection]

  # Add If-Modified-Since if page is in history
  if @conditional_requests
    cached = visited_page(uri)
    if cached && cached.response['Last-Modified']
      request['If-Modified-Since'] = cached.response['Last-Modified']
    end
  end

  # Specify timeouts if given
  http_obj.open_timeout = @open_timeout if @open_timeout
  http_obj.read_timeout = @read_timeout if @read_timeout
  http_obj.start unless http_obj.started?

  # Log specified headers for the request
  if log
    log.info("#{ request.class }: #{ request.path }")
    request.each_header do |name, value|
      log.debug("request-header: #{ name } => #{ value }")
    end
  end

  # Send the request
  attempts = 0
  begin
    response = http_obj.request(request, *request_data) { |raw_response|
      Chain.new([
        Chain::ResponseReader.new(raw_response),
        Chain::BodyDecodingHandler.new,
      ]).handle(options)
    }
  rescue EOFError, Errno::ECONNRESET, Errno::EPIPE => x
    log.error("Rescuing EOF error") if log
    http_obj.finish
    raise x if attempts >= 2
    # Reset the request body and reopen the connection before retrying.
    request.body = nil
    http_obj.start
    attempts += 1
    retry
  end

  Chain.new([
    @post_connect_hook,
    Chain::ResponseBodyParser.new(@pluggable_parser, @watch_for_set),
    Chain::ResponseHeaderHandler.new(@cookie_jar, @connection_cache),
  ]).handle(options)

  res_klass = options[:res_klass]
  page      = options[:page]

  log.info("status: #{ page.code }") if log

  if follow_meta_refresh
    redirect_uri = nil
    if page.respond_to?(:meta) && (redirect = page.meta.first)
      redirect_uri = redirect.uri.to_s
    elsif refresh = response['refresh']
      parsed_refresh = refresh.match(/^\s*(\d+\.?\d*);\s*(url|URL)=(\S*)\s*$/)
      raise StandardError, "Invalid refresh http header" unless parsed_refresh
      delay    = parsed_refresh[1]
      location = parsed_refresh[3]
      # Locations not containing "http" are treated as host-relative paths.
      location = "http://#{uri.host}#{location}" unless location.include?("http")
      if redirects + 1 > redirection_limit
        raise RedirectLimitReachedError.new(page, redirects)
      end
      sleep delay.to_i
      redirect_uri = location
    end

    if redirect_uri
      @history.push(page, page.uri)
      return fetch_page(
        :uri       => redirect_uri,
        :referer   => page,
        :params    => [],
        :verb      => :get,
        :redirects => redirects + 1
      )
    end
  end

  return page if res_klass <= Net::HTTPSuccess

  if res_klass == Net::HTTPNotModified
    log.debug("Got cached page") if log
    return visited_page(uri) || page
  elsif res_klass <= Net::HTTPRedirection
    return page unless follow_redirect?
    log.info("follow redirect to: #{ response['Location'] }") if log
    from_uri = page.uri
    raise RedirectLimitReachedError.new(page, redirects) if redirects + 1 > redirection_limit
    # A redirected HEAD stays a HEAD; every other verb becomes a GET.
    redirect_verb = options[:verb] == :head ? :head : :get
    page = fetch_page(
      :uri       => response['Location'].to_s,
      :referer   => page,
      :params    => [],
      :verb      => redirect_verb,
      :redirects => redirects + 1
    )
    @history.push(page, from_uri)
    return page
  elsif res_klass <= Net::HTTPUnauthorized
    raise ResponseCodeError.new(page) unless @user || @password
    # An auth scheme already recorded for this host means the retry failed.
    raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
    if response['www-authenticate'] =~ /Digest/i
      @auth_hash[uri.host] = :digest
      if response['server'] =~ /Microsoft-IIS/
        @auth_hash[uri.host] = :iis_digest
      end
      @digest = response['www-authenticate']
    else
      @auth_hash[uri.host] = :basic
    end
    return fetch_page(
      :uri     => uri,
      :referer => cur_page,
      :verb    => request.method.downcase.to_sym,
      :params  => request_data,
      :headers => options[:headers]
    )
  end

  raise ResponseCodeError.new(page), "Unhandled response", caller
end
# Fetches a URL with GET, adds the resulting page to the history, yields
# it to the block if one is given, and returns it.
#
# +options+ is either the URL itself or a Hash with +:url+ (required),
# +:params+, +:referer+ and +:headers+. In the non-Hash form +parameters+
# and +referer+ are taken from the positional arguments; for backwards
# compatibility a second argument that does not respond to +each+ is
# treated as the referer.
#
# Raises ArgumentError when the Hash form has no +:url+.
def get(options, parameters = [], referer = nil)
  if options.is_a?(Hash)
    url = options[:url]
    raise ArgumentError.new("url must be specified") unless url
    parameters = options[:params] || []
    referer    = options[:referer]
    headers    = options[:headers]
  else
    url = options
    # FIXME: Remove this in 0.8.0
    unless parameters.respond_to?(:each)
      referer    = parameters
      parameters = []
    end
  end

  # Without an explicit referer, absolute http URLs start from a blank
  # page; anything else is resolved against the current page when there
  # is one.
  referer ||=
    if url =~ /^http/
      Page.new(nil, {'content-type'=>'text/html'})
    else
      current_page || Page.new(nil, {'content-type'=>'text/html'})
    end

  # FIXME: Huge hack so that using a URI as a referer works.  I need to
  # refactor everything to pass around URIs but still support
  # WWW::Mechanize::Page#base
  unless referer.is_a?(WWW::Mechanize::File)
    referer_uri = referer.is_a?(String) ? URI.parse(referer) : referer
    referer = Page.new(referer_uri, {'content-type' => 'text/html'})
  end

  # fetch the page
  page = fetch_page(
    :uri     => url,
    :referer => referer,
    :headers => headers || {},
    :params  => parameters
  )
  add_to_history(page)
  yield page if block_given?
  page
end
# Fetches +url+ with #get and returns the body of the resulting page.
def get_file(url)
  page = get(url)
  page.body
end
###
# HEAD to +url+ with +query_params+, and setting +options+:
#
#   head('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
#
# Delegates to #put with <tt>:verb => :head</tt> merged into +options+ and
# returns whatever #put returns. A block given to this method is forwarded
# to #put, so it receives the fetched page just as it would when calling
# #put directly (previously the block was silently ignored).
def head(url, query_params = {}, options = {}, &block)
  put(url, query_params, options.merge(:verb => :head), &block)
end
# Builds an agent with default settings and yields it to the block, if one
# is given, so the caller can adjust the configuration.
def initialize
  # attr_accessors
  @cookie_jar    = CookieJar.new
  @log           = nil
  @open_timeout  = nil
  @read_timeout  = nil
  @user_agent    = AGENT_ALIASES['Mechanize']
  @watch_for_set = nil
  @history_added = nil
  @ca_file       = nil # OpenSSL server certificate file

  # callback for OpenSSL errors while verifying the server certificate
  # chain, can be used for debugging or to ignore errors by always
  # returning _true_
  @verify_callback = nil
  @cert            = nil # OpenSSL Certificate
  @key             = nil # OpenSSL Private Key
  @pass            = nil # OpenSSL Password
  @redirect_ok     = true # Should we follow redirects?

  # attr_readers
  @history          = WWW::Mechanize::History.new
  @pluggable_parser = PluggableParser.new

  # Auth variables
  @user      = nil # Auth User
  @password  = nil # Auth Password
  @digest    = nil # DigestAuth Digest
  @auth_hash = {}  # Keep track of urls for sending auth

  # Proxy settings
  @proxy_addr = nil
  @proxy_pass = nil
  @proxy_port = nil
  @proxy_user = nil

  @conditional_requests = true

  @follow_meta_refresh = false
  @redirection_limit   = 20

  # Connection Cache & Keep alive
  @connection_cache = {}
  @keep_alive_time  = 300
  @keep_alive       = true

  # Unknown schemes get a handler that raises; the known ones all share a
  # single handler that returns the link unchanged.
  @scheme_handlers = Hash.new do |handlers, scheme|
    handlers[scheme] = lambda { |link, page|
      raise UnsupportedSchemeError.new(scheme)
    }
  end
  passthrough = lambda { |link, page| link }
  ['http', 'https', 'relative', 'file'].each do |scheme|
    @scheme_handlers[scheme] = passthrough
  end

  @pre_connect_hook  = Chain::PreConnectHook.new
  @post_connect_hook = Chain::PostConnectHook.new

  yield self if block_given?
end
# Returns the logger held by the class (+self.class.log+).
def log
  self.class.log
end
# Sets the logger on the class (+self.class.log=+).
def log=(l)
  self.class.log = l
end
# Returns the maximum size of the history (+@history.max_size+).
def max_history
  @history.max_size
end
# Sets the maximum size of the history to +length+.
def max_history=(length)
  @history.max_size = length
end
# Posts to the given URL with the query parameters passed in.  Query
# parameters can be passed as a hash, or as an array of arrays.
#
# Example:
#   agent.post('http://example.com/', "foo" => "bar")
#
# A value that is an IO is sent as a file upload (its contents are read
# and the form is switched to multipart/form-data); every other value
# becomes a regular form field.
def post(url, query={})
  # Create a fake form
  node = {
    'method'  => 'POST',
    'enctype' => 'application/x-www-form-urlencoded',
  }
  def node.search(*args); []; end

  form = Form.new(node)
  query.each do |key, value|
    unless value.is_a?(IO)
      form.fields << Form::Field.new(key.to_s, value)
      next
    end

    form.enctype = 'multipart/form-data'
    upload = Form::FileUpload.new(key.to_s, ::File.basename(value.path))
    upload.file_data = value.read
    form.file_uploads << upload
  end

  post_form(url, form)
end
# Returns the list of hooks held by the post-connect hook object.
def post_connect_hooks
  @post_connect_hook.hooks
end
# Submits +form+ to +url+ with POST, adds the resulting page to the
# history and returns it.
#
# The referer is the form's page, falling back to the current page and
# finally to a blank page. The request body is +form.request_data+ and
# Content-Type is the form's enctype.
def post_form(url, form)
  cur_page = form.page || current_page ||
             Page.new(nil, {'content-type'=>'text/html'})

  request_data = form.request_data

  log.debug("query: #{ request_data.inspect }") if log

  # Content-Length is measured in bytes. String#size counts characters on
  # Ruby 1.9+, which undercounts bodies containing multibyte characters,
  # so prefer bytesize and fall back to size where it does not exist.
  content_length =
    if request_data.respond_to?(:bytesize)
      request_data.bytesize
    else
      request_data.size
    end

  # fetch the page
  page = fetch_page(
    :uri     => url,
    :referer => cur_page,
    :verb    => :post,
    :params  => [request_data],
    :headers => {
      'Content-Type'   => form.enctype,
      'Content-Length' => content_length.to_s,
    }
  )
  add_to_history(page)
  page
end
# Returns the list of hooks held by the pre-connect hook object.
def pre_connect_hooks
  @pre_connect_hook.hooks
end
# Pretty-printer hook: prints the cookie jar and the current page inside
# an object group for this agent.
def pretty_print(q)
  q.object_group(self) do
    q.breakable
    q.pp(cookie_jar)
    q.breakable
    q.pp(current_page)
  end
end
###
# PUT to +url+ with +query_params+, and setting +options+:
#
#   put('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
#
# Entries in +options+ override the defaults (+:uri+, +:headers+,
# +:params+, +:verb+). The fetched page is added to the history, yielded
# to the block if one is given, and returned.
def put(url, query_params = {}, options = {})
  defaults = {
    :uri     => url,
    :headers => {},
    :params  => query_params,
    :verb    => :put
  }

  # fetch the page
  page = fetch_page(defaults.merge(options))
  add_to_history(page)
  yield page if block_given?
  page
end
# Runs +url+ through the URI resolver chain, using +referer+ (the current
# page by default) as context, and returns the resulting URI as a String.
def resolve(url, referer = current_page())
  context = { :uri => url, :referer => referer }
  resolver = Chain.new([Chain::URIResolver.new(@scheme_handlers)])
  # The chain rewrites context[:uri] in place.
  resolver.handle(context)
  context[:uri].to_s
end
# Ignores its arguments and returns an empty array.
def search(*args)
  []
end
# Sets the proxy address, port, user, and password
def set_proxy(addr, port, user = nil, pass = nil)
  @proxy_addr = addr
  @proxy_port = port
  @proxy_user = user
  @proxy_pass = pass
end
# Submit a form with an optional button.
#
# Without a button:
#   page = agent.get('http://example.com')
#   agent.submit(page.forms.first)
#
# When +button+ is given it is added to the form's query first. POST forms
# go through #post_form; GET forms go through #get with the form's query
# as parameters and any existing query string stripped from the action.
# Any other form method raises a RuntimeError.
def submit(form, button=nil)
  form.add_button_to_query(button) if button

  verb = form.method.upcase
  if verb == 'POST'
    post_form(form.action, form)
  elsif verb == 'GET'
    get(
      :url     => form.action.gsub(/\?[^\?]*$/, ''),
      :params  => form.build_query,
      :referer => form.page
    )
  else
    raise "unsupported method: #{verb}"
  end
end
# Runs given block, then resets the page history as it was before. self is
# given as a parameter to the block. Returns the value of the block.
#
# The history is restored even when the block raises.
def transact
  saved_history = @history.dup
  begin
    yield self
  ensure
    @history = saved_history
  end
end
# Set the user agent for the Mechanize object by looking +al+ up in
# AGENT_ALIASES. Raises a RuntimeError when the alias is not known.
def user_agent_alias=(al)
  agent_string = AGENT_ALIASES[al]
  raise("unknown agent alias") unless agent_string
  self.user_agent = agent_string
end
# Returns true when #visited_page finds an entry for +url+, false
# otherwise.
def visited?(url)
  found = visited_page(url)
  !found.nil?
end
# Looks +url+ up in the history and returns what the history has stored
# for it. +url+ may also be an object responding to +href+ (such as a
# link), in which case its href is used. The URL is resolved before the
# lookup.
def visited_page(url)
  target = url.respond_to?(:href) ? url.href : url
  @history.visited_page(resolve(target))
end