Perl E-mail Harvester

For the discussion of Perl, Python, Ruby, PHP, and other interpreted languages.


Post by Muskelmann098 on Thu Jun 02, 2011 5:54 pm

Hey,

This is a project that I began about a year ago when I started learning Perl. Since then, it has been rotting in a folder until today, when I fixed up some of the code, added comments, and so on.

The web crawler will start from whatever URL you feed it. After that, you can grab a cup of coffee and sit back, because it will run until there are no more links left to explore on the World Wide Web... or at least that's the idea ;)
It scans each web page for e-mail addresses and puts them in a neat little file for you (because why the fuck not?). I'm unsure how useful it is unless you sell e-mail lists, but I'm posting it mostly so that people who know better can critique my code. And who knows? Maybe someone here will find a use for it.
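In a nutshell, the core of it is just a queue of URLs, a hash of visited pages, and a regex run over each page's HTML. The full script is below; a stripped-down sketch of that core loop looks something like this (simplified: no politeness delay, no domain filter, no output file, and 'http://example.com' is just a placeholder starting URL):

use strict;
use warnings;
use LWP::UserAgent;
use HTML::LinkExtor;

my @queue = ('http://example.com');   # placeholder starting URL
my (%visited, %seen_email);
my $ua = LWP::UserAgent->new( timeout => 10 );

while (my $url = shift @queue) {
   next if $visited{$url}++;              # skip pages already fetched
   my $res = $ua->get($url);
   next unless $res->is_success;
   my $html = $res->decoded_content // '';

   # grab anything that looks like an e-mail address
   while ($html =~ /\b([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})\b/gi) {
      print "$1\n" unless $seen_email{$1}++;
   }

   # extract links (resolved against the page URL) and queue the unvisited ones
   my $extor = HTML::LinkExtor->new(undef, $url);
   $extor->parse($html)->eof;
   for my $link ($extor->links) {
      my ($tag, %attr) = @$link;
      push @queue, grep { !$visited{$_} } map { "$_" } values %attr;
   }
}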

CODE:
#!C:\Perl\bin\perl

use strict;
use warnings;
use LWP::UserAgent;
use HTTP::Request;
use HTML::LinkExtor;
use URI;
use Time::HiRes qw(usleep);
use Getopt::Std;

##########   VARIABLE DECLARATION    ############
our ($opt_u,$opt_d);
getopts('u:d');
my %visited;
my %oldemail;
my @urls;
my $linkfile;
my $emailfile;
my $oldemailcount = 0;
my $curtime = localtime(time);
my $firstmail = 0;
my $domainonly = $opt_d ? 1 : 0;
my $startingURL = $opt_u;
my $domain;
my $sleep;
##########    -------------------    #############


# Loads already found e-mails from a database file IN THE SAME DIRECTORY AS THIS PROGRAM
loadEmails();

# If user did not use flags, show them the configuration settings
# If they DID use the flags, make sure that the URL is correctly formatted
if(!$opt_u){
   getInput();
}else{
   $startingURL = sanitizeURL($startingURL);
}

# Add the starting URL selected by user into the array containing all URLs to crawl
push @urls, $startingURL;

# Initialize LWP browser
my $browser = LWP::UserAgent->new();
$browser->timeout(10);
$browser->agent("Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; Win64; x64; Trident/4.0)");

########### -- STARTING THE MAIN LOOP -- ##########
MAIN: while (@urls) {
   # Set the number of microseconds to sleep between each page crawled.
   # This is randomized to a value between 12 and 17 seconds by default.
   $sleep = rand(5_000_000) + 12_000_000;

   # Draw a URL and remove it from the array
   my $url = shift @urls;

   select(STDOUT);
   next if $visited{$url};
   print "\n\n------------ CHECKING -> $url <- ----------\n\n";
   
   # Send request to web server via the browser
   my $request = HTTP::Request->new(GET => $url);
   my $response = $browser->request($request);
   
   # If the server responds with an error...
   if ($response->is_error()) {
      select(STDOUT);
      # print the error message from the server
      print $response->status_line, "\n";
      handleErrors();
      # skip parsing the failed page
      next MAIN;
   }
   # Extract HTML from HTTP response
   my $contents = $response->content();
   print "\nLinks Found:\n";
   # add link crawled to a hash of visited links.
   $visited{$url} = 1;
   # extract all links from HTML
   my ($page_parser) = HTML::LinkExtor->new(undef, $url);
   $page_parser->parse($contents)->eof;
   my @links = $page_parser->links;
   # If the page contains no links, call the handleErrors() function and move on
   if(!@links){
      handleErrors();
      next MAIN;
   }
   
   # For every link in the links array...
   foreach my $link (@links) {
      my $href = $$link[2];
      # skip non-HTML resources, feeds, and unwanted schemes
      next if $href =~ m/\.(?:png|css|ico|jpg|js|xml|gif|ashx)\b/i;
      next if $href =~ m/^(?:javascript:|mailto:|about:)/i;
      next if $href =~ m/\b(?:feeds|rss)\./i;
      # skip links that have already been crawled
      next if $visited{$href};
      # If the -d option is set, only keep links from the same domain as the starting URL.
      next if $domainonly == 1 and $href !~ m/\Q$domain\E/i;
      select(STDOUT);
      print "$href\n";
      # Push links found on the page onto the array of URLs to crawl
      push @urls, $href;
   }
   
   # Print all e-mail addresses found...
   print "\nEmails Found:\n";
   # ...that match these regexes.
   while($contents =~ m/\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/ig or $contents =~ m/\b[A-Z0-9._%+-]+\[+AT+\][A-Z0-9.-]+\.[A-Z]{2,4}\b/ig){
      # copy the match before any further pattern matches can clobber $&
      my $email = $&;
      # only add the e-mail to the list if it has not already been recorded (no duplicates)...
      next if $oldemail{$email};
      # ...and skip obviously unwanted addresses
      next if $email =~ m/example\.com/i or $email =~ m/spam/i or $email =~ m/xxx/i;

      # print to both the e-mail database file and the console.
      select(STDOUT);
      print "$email\n";
      open $emailfile, ">>", "emails.txt" or die "Cannot open emails.txt: $!";
      select($emailfile);
      if($firstmail == 0){
         print "\n-------------------- $curtime --------------------\n";
         $firstmail = 1;
      }
      print "$email\n";
      close $emailfile;
      select(STDOUT);
      $oldemail{$email} = 1;
   }
   # The loop is almost done. After a successful crawl of a link, the program sleeps for the
   # amount of time set at the start of the loop.
   select(STDOUT);
   print "\nProgram waits for ". $sleep/1000000 ." seconds before next request.\nThis is to prevent blacklisting.\n";
   usleep($sleep);
}
########## -- END OF MAIN LOOP -- ##########

sub loadEmails {
   print "Loading E-mails from file 'emails.txt'...\n\n";
   unless (open $emailfile, "<", "emails.txt") {
      print "E-mail file does not exist.\nIt will be created when you start crawling.\n\n";
      return;
   }
   while(<$emailfile>){
      chomp;
      $oldemail{$_} = 1;
      # only count lines that actually look like e-mail addresses
      # (the file also contains timestamp separator lines)
      if($_ =~ m/\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i or $_ =~ m/\b[A-Z0-9._%+-]+\[+AT+\][A-Z0-9.-]+\.[A-Z]{2,4}\b/i){
         $oldemailcount++;
      }
      print "$_\n";
   }
   close($emailfile);
   print "\n$oldemailcount E-Mails Loaded!\n";
}

# Function to be called if the user does not use the terminal flags or there is a mistake in the URL
sub getInput {
   # Get the URL to start crawling
   print "\nPlease enter a URL to start crawling. \n(Example: 'http://google.com' or 'yahoo.com')\n\n";
   print "http://";
   $startingURL = <STDIN>;
   chomp $startingURL;
   
   # Format the URL and extract its domain
   $startingURL = sanitizeURL($startingURL);
   
   print "\n\nDo you want to follow only links in the same domain as the URL you typed in?\n".
   "This is useful for avoiding links to advertising sites, which usually do not contain e-mail addresses.\n\n".
   "Domain = $domain \n\n".
   "1) Yes\n".
   "2) No\n".
   "3) Exit Program\n\n";

   # Ask the user whether to follow only links in the same domain.
   my $domainchoice = 0;
   while($domainchoice != 1 && $domainchoice != 2 && $domainchoice != 3){
      $domainchoice = <STDIN>;
      chomp $domainchoice;
      if($domainchoice == 1){
         $domainonly = 1;
      }
      elsif($domainchoice == 2){
         $domainonly = 0;
      }
      elsif($domainchoice == 3){
         exit;
      }
   }
}
############## ------------------------------------------- ################


############## -- Will correctly format a URL string and extract its domain -- #############
sub sanitizeURL {
   my $url = $_[0];
   my $http = 'http://';
   
   # if there is no "http://" or "https://" in front of the URL, add it.
   if($url !~ m/^https?:/i){
      $url = $http.$url;
   }
   
   # extract the domain of the URL string using the URI class.
   $domain = URI->new($url,"http");
   $domain = $domain->host;
   
   # return the sanitized URL
   return $url;
}
############# -- Will handle errors by asking for a new URL to start over with if necessary -- #############
sub handleErrors {
   # if there are no URLs left, ask for a new starting URL, then continue.
   # else, sleep and continue.
   if(scalar @urls < 1){
      print "No more URLs to crawl.\n";
      getInput();
      push @urls, $startingURL;
   }else{
      print "\nProgram waits for ". $sleep/1000000 ." seconds before next request.\nThis is to prevent blacklisting.\n";
      usleep($sleep);
   }
}
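
To try it out, assuming you save the script as harvester.pl (the name is up to you), pass the starting URL with -u and add -d if you only want to follow links on that domain; with no flags it asks for everything interactively. For example:

perl harvester.pl -u http://example.com -d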


Any comments would be appreciated. You can use it however you want (fix it up, spam people, kiss it, etc.), but what you do with it is entirely your responsibility...
Muskelmann098
Experienced User
Posts: 78
Joined: Mon Feb 02, 2009 9:39 am

