class MiGA::Cli::Action::TaxDist
def cannid(a, b)
def cannid(a, b) (a > b ? [b, a] : [a, b]).join('-') end
def get_tab_index(dir)
def get_tab_index(dir) if cli[:index].nil? ds = cli.load_and_filter_datasets ds.keep_if { |d| !d.metadata[:tax].nil? } cli.say 'Indexing taxonomy' tax_index = TaxIndex.new ds.each { |d| tax_index << d } tab = File.expand_path('index.tab', dir) File.open(tab, 'w') { |fh| fh.print tax_index.to_tab } else tab = cli[:index] end tab end
def parse_cli
def parse_cli cli.parse do |opt| cli.opt_object(opt, [:project]) cli.opt_filter_datasets(opt) opt.on( '-i', '--index FILE', 'Pre-calculated tax-index (in tabular format) to be used', 'If passed, dataset filtering arguments are ignored' ) { |v| cli[:index] = v } opt.on( '-m', '--metric STR', 'Distance metric used to evaluate the distribution', 'By default: AAI for genomes projects, ANI for clade projects' ) { |v| cli[:metric] = v.downcase } end end
def perform
def perform dist = read_distances Dir.mktmpdir do |dir| tab = get_tab_index(dir) dist = traverse_taxonomy(tab, dist) end cli.say 'Generating report' dist.keys.each do |k| dist[k][5] = dist[k][4].reverse.join(' ') dist[k][4] = dist[k][4].first puts (k.split('-') + dist[k]).join("\t") end end
def read_distances
def read_distances p = cli.load_project cli[:metric] ||= p.clade? ? 'ani' : 'aai' res_n = "#{cli[:metric]}_distances" cli.say "Reading distances: 1-#{cli[:metric].upcase}" res = p.result(res_n) raise "#{res_n} not yet calculated" if res.nil? matrix = res.file_path(:matrix) raise "#{res_n} has no matrix" if matrix.nil? dist = {} mfh = (matrix =~ /\.gz$/) ? Zlib::GzipReader.open(matrix) : File.open(matrix, 'r') mfh.each_line do |ln| next if mfh.lineno == 1 row = ln.chomp.split("\t") dist[cannid(row[1], row[2])] = [row[3], row[5], row[6], 0, ['root:biota']] cli.advance('Lines:', mfh.lineno, nil, false) if (mfh.lineno % 1_000) == 0 end cli.say '' mfh.close dist end
def traverse_taxonomy(tab, dist)
def traverse_taxonomy(tab, dist) cli.say 'Traversing taxonomy' rank_i = 0 Taxonomy.KNOWN_RANKS.each do |rank| next if rank == :ns rank_n = 0 rank_i += 1 in_rank = nil ds_name = [] File.open(tab, 'r') do |fh| fh.each_line do |ln| if ln =~ /^ {0,#{(rank_i - 1) * 2}}\S+:\S+:/ in_rank = nil ds_name = [] elsif ln =~ /^ {#{rank_i * 2}}(#{rank}:(\S+)):/ in_rank = $2 == '?' ? nil : $1 ds_name = [] elsif ln =~ /^ *# (\S+)/ && !in_rank.nil? ds_i = $1 ds_name << ds_i ds_name.each do |ds_j| k = cannid(ds_i, ds_j) next if dist[k].nil? rank_n += 1 dist[k][3] = rank_i dist[k][4].unshift in_rank end end end end cli.say "o #{rank}: #{rank_n} pairs of datasets" end dist end