Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

vistahigherlearning / logstash   deb

Repository URL to install this package:

/ opt / logstash / vendor / bundle / jruby / 1.9 / gems / mail-2.5.3 / lib / tasks / corpus.rake

namespace :corpus do

  # Prerequisite task: loads spec/environment and the mail library so the
  # corpus tasks can call Mail.read.
  task :load_mail do
    # Resolve against this rake file (lib/tasks/corpus.rake) instead of the
    # process's working directory; three '..' segments walk up from the file
    # itself to the project root.
    require File.expand_path('../../../spec/environment', __FILE__)
    require 'mail'
  end

  # Used to run parsing against an arbitrary corpus of email.
  # For example: http://plg.uwaterloo.ca/~gvcormac/treccorpus/
  #
  # Environment variables:
  #   LOCATION - root directory of the corpus (defaults to 'corpus/spam')
  #   SAVE_TO  - directory receiving copies of emails that failed to parse
  #              (defaults to 'spec/fixtures/emails/failed_emails')
  #
  # Raises RuntimeError when either directory does not exist.
  desc "Provide a LOCATION=/some/dir to verify parsing in bulk, otherwise defaults"
  task :verify_all => :load_mail do

    root_of_corpus    = File.expand_path(ENV['LOCATION'] || 'corpus/spam')
    @save_failures_to = ENV['SAVE_TO'] || 'spec/fixtures/emails/failed_emails'
    @failed_emails    = []
    @checked_count    = 0

    unless File.directory?(root_of_corpus)
      raise "\n\tPath '#{root_of_corpus}' is not a directory.\n\n"
    end

    unless File.directory?(@save_failures_to)
      raise "\n\tPath '#{@save_failures_to}' is not a directory.\n\n"
    end
    @save_failures_to = File.expand_path(@save_failures_to)
    puts "Mail which fails to parse will be saved in '#{@save_failures_to}'"

    puts "Checking '#{root_of_corpus}' directory (recursively)"

    # we're tracking all the errors separately, don't clutter terminal
    stderr_backup = $stderr.dup
    $stderr.reopen(File::NULL, "w")
    begin
      dir_node(root_of_corpus)
    ensure
      # put our toys back now that we're done with them -- also when the
      # traversal raised, otherwise the error report itself would be discarded
      $stderr.reopen(stderr_backup)
      stderr_backup.close
    end

    puts "\n\n"

    if @failed_emails.any?
      report_failures_to_stdout
    end
    puts "Out of Total: #{@checked_count}"

    if @failed_emails.empty?
      puts "There are no errors"
    elsif ENV['SAVE_TO'].nil?
      # Failures went to the default directory; point out how to redirect them.
      puts "Add SAVE_TO=/some/dir to save failed emails to for review.,"
      puts "May result in a lot of saved files. Do a dry run first!\n\n"
    end
  end

  def dir_node(path)
    puts "\n\n"
    puts "Checking emails in '#{path}':"

    entries = Dir.entries(path)

    entries.each do |entry|
      next if ['.', '..'].include?(entry)
      full_path = File.join(path, entry)

      if File.file?(full_path)
        file_node(full_path)
      elsif File.directory?(full_path)
        dir_node(full_path)
      end
    end
  end
  
  # Handles one regular file met during the corpus walk by passing its
  # +path+ on to +verify+; the result of +verify+ is returned unchanged.
  def file_node(path)
    verify path
  end
  
  # Parses the file at +path+ via +parse_as_mail+ and prints a one-character
  # progress marker: '.' when parsing succeeded, 'x' when it failed. Failures
  # are also recorded through +save_failure+.
  def verify(path)
    parsed, message = parse_as_mail(path)
    if parsed
      print '.'
    else
      save_failure(path, message)
      print 'x'
    end
    # Flush after either marker so failures show up as promptly as successes.
    $stdout.flush
  end

  # Records a parse failure for the email at +path+ with the error +message+.
  #
  # The pair is always appended to @failed_emails. When @save_failures_to is
  # set, the email is additionally copied into that directory under a name
  # built from the sanitised message and the email's basename.
  def save_failure(path, message)
    @failed_emails << [path, message]
    return unless @save_failures_to

    # Error messages can be arbitrarily long; cap the prefix so the resulting
    # file name stays within common file system name limits instead of
    # aborting the whole run with Errno::ENAMETOOLONG.
    failure_as_filename = message.gsub(/\W/, '_')[0, 100]
    new_email_name = [failure_as_filename, File.basename(path)].join("_")

    # Copy byte-for-byte: corpus mail may be in any encoding.
    File.open(File.join(@save_failures_to, new_email_name), 'wb') do |fh|
      fh << File.binread(path)
    end
  end

  # Attempts to read the file at +path+ with Mail.read and bumps
  # @checked_count. Returns [true, nil] when reading succeeded, or
  # [false, <error message>] when a StandardError was raised.
  def parse_as_mail(path)
    @checked_count += 1

    outcome = [true, nil]
    begin
      Mail.read(path)
    rescue => error
      outcome = [false, error.message]
    end
    outcome
  end
  
  # Prints one "<path> : <message>" line per entry of @failed_emails,
  # followed by the total number of failures.
  def report_failures_to_stdout
    @failed_emails.each do |path, message|
      puts "#{path} : #{message}"
    end
    puts "Failed: #{@failed_emails.size}"
  end
  
end