lib/miga/cli/action/derep_wf.rb
# @package MiGA # @license Artistic-2.0 require 'miga/cli/action' class MiGA::Cli::Action::DerepWf < MiGA::Cli::Action require 'miga/cli/action/wf' include MiGA::Cli::Action::Wf def parse_cli default_opts_for_wf cli.defaults = { metric: :ani, threshold: 95.0, criterion: :quality, summaries: true, collection: true } cli.parse do |opt| opt.on( '--aai', 'Use Average Amino Acid Identity (AAI) as genome similarity metric', 'By default: Use Average Nucleotide Identity (ANI)' ) { cli[:metric] = :aai } opt.on( '--ani', 'Use Average Nucleotide Identity (ANI) as similarity metric (default)' ) { cli[:metric] = :ani } opt.on( '--threshold FLOAT', Float, "Metric threshold (%) to dereplicate. By default: #{cli[:threshold]}" ) { |v| cli[:threshold] = v } opt.on( '--quality', 'Use genome with highest quality as clade representatives (default)' ) { |v| cli[:criterion] = :quality } opt.on( '--medoids', 'Use medoids as clade representatives' ) { |v| cli[:criterion] = :medoids } opt.on( '--no-collection', 'Do not generate a dereplicated collection of assemblies' ) { |v| cli[:collection] = v } opt.on( '--no-summaries', 'Do not generate intermediate step summaries' ) { |v| cli[:summaries] = v } opts_for_wf_distances(opt) opts_for_wf(opt, 'Input genome assemblies (nucleotides, FastA)') end end def perform # Input data p = create_project( :assembly, { run_clades: false }, { run_mytaxa_scan: false, run_ssu: false } ) p.set_option(:gsp_metric, cli[:metric].to_s) p.set_option(:"gsp_#{cli[:metric]}", cli[:threshold]) # Run run_daemon dereplicate(p) summarize(%w[cds assembly essential_genes]) if cli[:summaries] cleanup end private def dereplicate(p) return if cli[:prepare_and_exit] cli.say 'Extracting genomospecies clades' r = p.result(:clade_finding) or raise 'Result unavailable: run failed' c_f = r.file_path(:clades_gsp) or raise 'Result incomplete: run failed' clades = File.readlines(c_f).map { |i| i.chomp.split("\t") } rep = representatives(p) File.open(File.join(cli[:outdir], 'genomospecies.tsv'), 'w') do |fh| fh.puts "Clade\tRepresentative\tMembers" clades.each_with_index do |i, k| fh.puts ["gsp_#{k + 1}", rep[k], i.join(',')].join("\t") end end if cli[:collection] dir = File.join(cli[:outdir], 'representatives') FileUtils.mkdir_p(dir) rep.each do |i| f = p.dataset(i).result(:assembly).file_path(:largecontigs) FileUtils.cp(f, dir) end end end def representatives(p) cli.say 'Identifying representatives' f = File.join(cli[:outdir], 'representatives.txt') if cli[:criterion] == :medoids FileUtils.cp(p.result(:clade_finding).file_path(:medoids_gsp), f) else src = File.join(MiGA::MiGA.root_path, 'utils/representatives.rb') MiGA::MiGA.run_cmd("ruby '#{src}' '#{p.path}' | cut -f 2", stdout: f) end File.readlines(f).map(&:chomp) end end