#!/usr/bin/env ruby -w
require "optparse"
require "sexp_processor"
require "ruby_parser"
require "path_expander"
require "timeout"
require "zlib"
class Flay
VERSION = "2.13.3" # :nodoc:
class Item < Struct.new(:structural_hash, :name, :bonus, :mass, :locations)
alias identical? bonus
end
class Location < Struct.new(:file, :line, :fuzzy)
alias fuzzy? fuzzy
end
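##
# Expand +args+ (defaulting to ARGV) into a list of files via
# PathExpander, honoring DEFAULT_IGNORE, then process them with a new
# Flay instance and return that instance.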
def self.run args = ARGV
extensions = ["rb"] + Flay.load_plugins
glob = "**/*.{#{extensions.join ","}}"
expander = PathExpander.new args, glob
files = expander.filter_files expander.process, DEFAULT_IGNORE
flay = Flay.new Flay.parse_options args
flay.process(*files.sort)
flay
end
##
# Returns the default options.
def self.default_options
{
:diff => false,
:mass => 16,
:summary => false,
:verbose => false,
:number => true,
:timeout => 10,
:liberal => false,
:fuzzy => false,
:only => nil,
:filters => [],
}
end
##
# Process options in +args+, defaulting to +ARGV+.
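#
# A rough sketch of the resulting hash (the flags below are
# illustrative):
#
#   opts = Flay.parse_options %w[--mass 30 --fuzzy]
#   opts[:mass]  # => 30
#   opts[:fuzzy] # => 1 (default when no DIFF argument is given)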
def self.parse_options args = ARGV
options = self.default_options
OptionParser.new do |opts|
opts.banner = "flay [options] files_or_dirs"
opts.version = Flay::VERSION
opts.separator ""
opts.separator "Specific options:"
opts.separator ""
opts.on("-h", "--help", "Display this help.") do
puts opts
exit
end
opts.on("-f", "--fuzzy [DIFF]", Integer,
"Detect fuzzy (copy & paste) duplication (default 1).") do |n|
options[:fuzzy] = n || 1
end
opts.on("-l", "--liberal", "Use a more liberal detection method.") do
options[:liberal] = true
end
opts.on("-m", "--mass MASS", Integer,
"Sets mass threshold (default = #{options[:mass]})") do |m|
options[:mass] = m.to_i
end
opts.on("-#", "Don't number output (helps with diffs)") do |m|
options[:number] = false
end
opts.on("-v", "--verbose", "Verbose. Show progress processing files.") do
options[:verbose] = true
end
opts.on("-o", "--only NODE", String, "Only show matches on NODE type.") do |s|
options[:only] = s.to_sym
end
opts.on("-d", "--diff", "Diff Mode. Display N-Way diff for ruby.") do
options[:diff] = true
end
opts.on("-s", "--summary", "Summarize. Show flay score per file only.") do
options[:summary] = true
end
opts.on("-t", "--timeout TIME", Integer,
"Set the timeout. (default = #{options[:timeout]})") do |t|
options[:timeout] = t.to_i
end
extensions = ["rb"] + Flay.load_plugins
opts.separator ""
opts.separator "Known extensions: #{extensions.join(", ")}"
extensions.each do |meth|
msg = "options_#{meth}"
send msg, opts, options if self.respond_to?(msg)
end
begin
opts.parse! args
rescue => e
abort "#{e}\n\n#{opts}"
end
end
options
end
# so I can move this to flog wholesale
DEFAULT_IGNORE = ".flayignore" # :nodoc:
##
# Loads all flay plugins. Files must be named "flay_*.rb".
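#
# A hypothetical plugin sketch (the +haml+ extension and its handling
# are assumptions, not a shipped plugin); a plugin usually adds a
# +process_<ext>+ instance method and, optionally, an +options_<ext>+
# class hook:
#
#   # flay_haml.rb, discoverable via Gem.find_files
#   class Flay
#     def process_haml file
#       process_rb file # illustrative: reuse the ruby parser
#     end
#   end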
def self.load_plugins
unless defined? @@plugins then
@@plugins = []
plugins = Gem.find_files("flay_*.rb").reject { |p| p =~ /flay_task/ }
plugins.each do |plugin|
plugin_name = File.basename(plugin, ".rb").sub(/^flay_/, "")
next if @@plugins.include? plugin_name
begin
load plugin
@@plugins << plugin_name
rescue LoadError => e
warn "error loading #{plugin.inspect}: #{e.message}. skipping..."
end
end
end
@@plugins
rescue => e
warn "Error loading plugins: #{e}" if option[:verbose]
end
# :stopdoc:
attr_accessor :mass_threshold, :total, :identical, :masses
attr_reader :hashes, :option
# :startdoc:
##
# Create a new instance of Flay with +option+s.
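#
# A minimal sketch:
#
#   flay = Flay.new :mass => 4, :diff => true
#   flay.mass_threshold # => 4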
def initialize option = {}
@option = Flay.default_options.merge option
@hashes = Hash.new { |h,k| h[k] = [] }
self.identical = {}
self.masses = {}
self.total = 0
self.mass_threshold = @option[:mass]
end
##
# Process any number of files.
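#
# Typical driver usage (paths are hypothetical):
#
#   flay = Flay.new
#   flay.process "lib/a.rb", "lib/b.rb"
#   flay.report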
def process(*files) # TODO: rename from process - should act as SexpProcessor
files.each do |file|
warn "Processing #{file}" if option[:verbose]
ext = File.extname(file).sub(/^\./, "")
ext = "rb" if ext.nil? || ext.empty?
msg = "process_#{ext}"
unless respond_to? msg then
warn " Unknown file type: #{ext}, defaulting to ruby"
msg = "process_rb"
end
begin
sexp = begin
send msg, file
rescue => e
warn " #{e.message.strip}"
warn " skipping #{file}"
nil
end
next unless sexp
process_sexp sexp
rescue SyntaxError => e
warn " skipping #{file}: #{e.message}"
end
end
end
##
# Prune, find identical nodes, and update masses.
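#
# Returns Item structs sorted by descending mass (values below are
# illustrative):
#
#   flay.analyze.first
#   # => #<struct Flay::Item structural_hash=..., name=:defn,
#   #      bonus=nil, mass=34, locations=[...]>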
def analyze filter = nil
self.prune
self.hashes.each do |hash,nodes|
identical[hash] = nodes[1..-1].all? { |n| n == nodes.first }
end
update_masses
sorted = masses.sort_by { |h,m|
exp = hashes[h].first
[-m,
exp.file,
exp.line,
exp.sexp_type.to_s]
}
sorted.map { |hash, mass|
nodes = hashes[hash]
next if filter && nodes.first.sexp_type != filter
same = identical[hash]
node = nodes.first
n = nodes.size
bonus = "*#{n}" if same
locs = nodes.sort_by { |x| [x.file, x.line] }.map { |x|
extra = :fuzzy if x.modified?
Location[x.file, x.line, extra]
}
Item[hash, node.sexp_type, bonus, mass, locs]
}.compact
end
##
# Reset total and recalculate the masses for all nodes in +hashes+.
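#
# A bucket's mass is the node mass times its occurrence count,
# multiplied by the count again when every occurrence is identical,
# e.g.:
#
#   # 3 copies of a node with mass 18:
#   #   18 * 3     = 54   (similar)
#   #   18 * 3 * 3 = 162  (identical)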
def update_masses
self.total = 0
masses.clear
self.hashes.each do |hash, nodes|
masses[hash] = nodes.first.mass * nodes.size
masses[hash] *= (nodes.size) if identical[hash]
self.total += masses[hash]
end
end
##
# Parse a ruby +file+ and return the sexp.
#
# --
# TODO: change the system and rename this to parse_rb.
def process_rb file
begin
RubyParser.new.process(File.binread(file), file, option[:timeout])
rescue Timeout::Error
warn "TIMEOUT parsing #{file}. Skipping."
end
end
##
# Before processing, remove any sexps that match the filters specified
# in +option[:filters]+. This modifies the sexp in place.
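#
# Filters only need to respond to +satisfy?+. A minimal sketch using a
# hand-rolled matcher (the matcher object here is illustrative):
#
#   skip_defs = Object.new
#   def skip_defs.satisfy? sexp; sexp.sexp_type == :defs; end
#   flay = Flay.new :filters => [skip_defs]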
def filter_sexp exp
exp.delete_if { |sexp|
if Sexp === sexp then
del = option[:filters].any? { |pattern| pattern.satisfy? sexp }
del or (filter_sexp(sexp); false)
end
}
end
##
# Process a sexp +pt+.
def process_sexp pt
filter_sexp(pt).deep_each do |node|
next :skip if node.none? { |sub| Sexp === sub }
next :skip if node.mass < self.mass_threshold
self.hashes[node.structural_hash] << node
process_fuzzy node, option[:fuzzy] if option[:fuzzy]
end
end
# :stopdoc:
MAX_NODE_SIZE = 10 # prevents exponential blowout
MAX_AVG_MASS = 12 # prevents exponential blowout
# :startdoc:
##
# Process "fuzzy" matches for +node+. A fuzzy match is a subset of
# +node+ up to +difference+ elements less than the original.
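#
# E.g. with +difference+ = 1, a node whose code section holds the
# statements [a, b, c] is also hashed as [a, b], [a, c], and [b, c], so
# a copy with one statement added or removed still shares a bucket.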
def process_fuzzy node, difference
return unless node.has_code?
avg_mass = node.mass / node.size
return if node.size > MAX_NODE_SIZE or avg_mass > MAX_AVG_MASS
tmpl, code = node.split_code
tmpl.modified = true
(code.size - 1).downto(code.size - difference) do |n|
code.combination(n).each do |subcode|
new_node = tmpl + subcode
next unless new_node.any? { |sub| Sexp === sub }
next if new_node.mass < self.mass_threshold
# they're already structurally similar, don't bother adding another
next if self.hashes[new_node.structural_hash].any? { |sub|
sub.file == new_node.file and sub.line == new_node.line
}
self.hashes[new_node.structural_hash] << new_node
end
end
end
##
# Prunes nodes that aren't relevant to analysis or are already
# covered by another node. Also deletes nodes based on the
# +:filters+ option.
def prune
# prune trees that aren't duped at all, or are too small
self.hashes.delete_if { |_,nodes| nodes.size == 1 }
self.hashes.delete_if { |_,nodes| nodes.all?(&:modified?) }
if option[:liberal] then
prune_liberally
else
prune_conservatively
end
end
##
# Conservative prune. Remove any bucket that is known to contain a
# subnode element of a node in another bucket.
def prune_conservatively
hashes_to_prune = {}
# extract all subtree hashes from all nodes
self.hashes.values.each do |nodes|
nodes.first.all_structural_subhashes.each do |h|
hashes_to_prune[h] = true
end
end
# nuke subtrees so we show the biggest matching tree possible
self.hashes.delete_if { |h,_| hashes_to_prune[h] }
end
##
# Liberal prune. Remove any _element_ from a bucket that is known to
# be a subnode of another node. Removed by identity.
def prune_liberally
update_masses
hashes_to_prune = Hash.new { |h,k| h[k] = [] }
# record each subtree by subhash, but skip if subtree mass > parent mass
self.hashes.values.each do |nodes|
nodes.each do |node|
tophash = node.structural_hash
topscore = self.masses[tophash]
node.deep_each do |subnode|
subhash = subnode.structural_hash
subscore = self.masses[subhash]
next if subscore and subscore > topscore
hashes_to_prune[subhash] << subnode
end
end
end
# nuke only individual items by object identity
self.hashes.each do |h,v|
v.delete_eql hashes_to_prune[h]
end
# nuke buckets we happened to fully empty
self.hashes.delete_if { |k,v| v.size <= 1 }
end
##
# Output an n-way diff from +data+. This is only used if --diff is
# given.
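#
# Lines common to all inputs are indented; differing lines are prefixed
# with a letter per input, roughly (contents illustrative):
#
#     def x(y)
#   A:   y * 2
#   B:   y * 3
#     end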
def n_way_diff *data
comments = []
codes = []
split_and_group(data).each do |subdata|
n = subdata.find_index { |s| s !~ /^#/ }
comment, code = subdata[0..n-1], subdata[n..-1]
comment = [] if n == 0
comments << comment
codes << code
end
comments = collapse_and_label pad_with_empty_strings comments
codes = collapse_and_label pad_with_empty_strings codes
(comments + codes).flatten.join("\n")
end
def split_and_group ary # :nodoc:
ary.each_with_index.map { |s, i|
c = (?A.ord + i).chr
s.scan(/^.*/).map { |s2|
s2.group = c
s2
}
}
end
def pad_with_empty_strings ary # :nodoc:
max = ary.map { |s| s.size }.max
ary.map { |a| a + ([""] * (max - a.size)) }
end
def collapse_and_label ary # :nodoc:
ary[0].zip(*ary[1..-1]).map { |lines|
if lines.uniq.size == 1 then
" #{lines.first}"
else
lines.reject { |l| l.empty? }.map { |l| "#{l.group}: #{l}" }
end
}
end
##
# Calculate summary scores on a per-file basis. For --summary.
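#
# Each match's mass is split evenly across the files involved (values
# illustrative):
#
#   flay.summary # => { "lib/a.rb" => 27.0, "lib/b.rb" => 27.0 }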
def summary
score = Hash.new 0
masses.each do |hash, mass|
sexps = hashes[hash]
mass_per_file = mass.to_f / sexps.size
sexps.each do |sexp|
score[sexp.file] += mass_per_file
end
end
score
end
##
# Output the report. Duh.
def report io = $stdout
only = option[:only]
data = analyze only
io.puts "Total score (lower is better) = #{self.total}"
if option[:summary] then
io.puts
self.summary.sort_by { |f,v| [-v, f] }.each do |file, score|
io.puts "%8.2f: %s" % [score, file]
end
return
end
data.each_with_index do |item, count|
prefix = "%d) " % (count + 1) if option[:number]
match = item.identical? ? "IDENTICAL" : "Similar"
io.puts
io.puts "%s%s code found in %p (mass%s = %d)" %
[prefix, match, item.name, item.bonus, item.mass]
item.locations.each_with_index do |loc, i|
loc_prefix = "%s: " % (?A.ord + i).chr if option[:diff]
extra = " (FUZZY)" if loc.fuzzy?
io.puts " %s%s:%d%s" % [loc_prefix, loc.file, loc.line, extra]
end
if option[:diff] then
io.puts
nodes = hashes[item.structural_hash]
sources = nodes.map do |s|
msg = "sexp_to_#{File.extname(s.file).sub(/./, "")}"
self.respond_to?(msg) ? self.send(msg, s) : sexp_to_rb(s)
end
io.puts n_way_diff(*sources)
end
end
end
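##
# Convert +sexp+ back into ruby source via ruby2ruby, for --diff
# output. Degrades to an explanatory string if ruby2ruby is missing.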
def sexp_to_rb sexp
begin
require "ruby2ruby"
rescue LoadError
return "ruby2ruby is required for diff"
end
@r2r ||= Ruby2Ruby.new
@r2r.process sexp.deep_clone
end
end
class String
attr_accessor :group # :nodoc:
end
class Sexp
##
# Whether or not this sexp is a mutated/modified sexp.
attr_accessor :modified
alias :modified? :modified # Is this sexp modified?
##
# Calculate the structural hash for this sexp. Cached, so don't
# modify the sexp afterwards and expect it to be correct.
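#
# Only node types contribute, so shape-identical nodes with different
# literals collide (sketch):
#
#   s(:call, nil, :puts, s(:str, "hi")).structural_hash ==
#     s(:call, nil, :p, s(:str, "yo")).structural_hash # => true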
def structural_hash
@structural_hash ||= pure_ruby_hash
end
##
# Returns a list of structural hashes for all nodes (and sub-nodes)
# of this sexp.
def all_structural_subhashes
hashes = []
self.deep_each do |node|
hashes << node.structural_hash
end
hashes
end
def initialize_copy o # :nodoc:
s = super
s.file = o.file
s.line = o.line
s.modified = o.modified
s
end
alias :[] :[] # needed for STRICT_SEXP
def [] a # :nodoc:
# TODO: figure out a way to make this STRICT_SEXP happy
s = super
if Sexp === s then
s.file = self.file
s.line = self.line
s.modified = self.modified
end
s
end
def + o # :nodoc:
self.dup.concat o
end
##
# Useful general array method that splits the array from 0..+n+ and
# the rest. Returns both sections.
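#
# E.g. (sketch):
#
#   front, rest = s(:defn, :x, s(:args), s(:nil)).split_at(2)
#   # front holds elements 0..2, rest holds the remainder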
def split_at n
return self[0..n], self[n+1..-1]
end
##
# Return the index of the last non-code element, or nil if this sexp
# is not a code-bearing node.
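#
# E.g. (sketch):
#
#   s(:defn, :x, s(:args), s(:nil)).code_index # => 2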
def code_index
{
:block => 0, # s(:block, *code)
:class => 2, # s(:class, name, super, *code)
:module => 1, # s(:module, name, *code)
:defn => 2, # s(:defn, name, args, *code)
:defs => 3, # s(:defs, recv, name, args, *code)
:iter => 2, # s(:iter, recv, args, *code)
}[self.sexp_type]
end
alias has_code? code_index # Does this sexp have a +*code+ section?
##
# Split the sexp into front-matter and code-matter, returning both.
# See #code_index.
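#
# E.g. (sketch):
#
#   tmpl, code = s(:defn, :x, s(:args), s(:call, nil, :foo)).split_code
#   # tmpl covers (:defn, :x, (:args)), code covers the body statements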
def split_code
index = self.code_index
self.split_at index if index
end
end
class Sexp # straight from flay-persistent
NODE_NAMES = Hash.new { |h,k| h[k] = Zlib.crc32(k.to_s) }
MAX_INT32 = 2 ** 32 - 1 # :nodoc:
def pure_ruby_hash # :nodoc: see above
hash = 0
n = NODE_NAMES[sexp_type]
raise "Bad lookup: #{first} in #{sexp.inspect}" unless n
hash += n & MAX_INT32
hash += hash << 10 & MAX_INT32
hash ^= hash >> 6 & MAX_INT32
each do |o|
next unless Sexp === o
hash = hash + o.pure_ruby_hash & MAX_INT32
hash = (hash + (hash << 10)) & MAX_INT32
hash = (hash ^ (hash >> 6)) & MAX_INT32
end
hash = (hash + (hash << 3)) & MAX_INT32
hash = (hash ^ (hash >> 11)) & MAX_INT32
hash = (hash + (hash << 15)) & MAX_INT32
hash
end
end
class Array # :nodoc:
##
# Delete anything in +self+ that is identical (the same object) to
# anything in +other+.
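#
# Identity, not equality, is compared, e.g.:
#
#   a = "x"
#   list = [a, "x"]
#   list.delete_eql ["x"] # different "x" objects: nothing removed
#   list.delete_eql [a]   # removes only the first element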
def delete_eql other
self.delete_if { |o1| other.any? { |o2| o1.equal? o2 } }
end
end