From b7482f364790a247f71cfb2d528ebd27ed81427e Mon Sep 17 00:00:00 2001
From: maldevel
Date: Tue, 19 Apr 2016 13:20:42 +0300
Subject: [PATCH] release 1.0

release 1.0
---
 EmailHarvester.py | 238 +++++++++++++++++++++++++++++++++++++++++++++-
 README.md         |  49 ++++++++--
 requirements.txt  |   3 +
 3 files changed, 278 insertions(+), 12 deletions(-)

diff --git a/EmailHarvester.py b/EmailHarvester.py
index f599198..0f682cd 100644
--- a/EmailHarvester.py
+++ b/EmailHarvester.py
@@ -28,34 +28,264 @@
 __copyright__ = "Copyright (c) 2016 @maldevel"
 __credits__ = ["maldevel"]
 __license__ = "GPLv3"
-__version__ = "3.0"
+__version__ = "1.0"
 __maintainer__ = "maldevel"
 
 ################################
 
 import argparse
 import sys
+import time
+import requests
+import re
+from termcolor import colored
 from argparse import RawTextHelpFormatter
+from sys import platform as _platform
 
 ################################
 
+if _platform == 'win32':
+    import colorama
+    colorama.init()
+
+
+class myparser:
+    def __init__(self, results, word):
+        self.results = results
+        self.word = word
+        self.temp = []
+
+    def genericClean(self):
+        self.results = re.sub('', '', self.results)
+        self.results = re.sub('', '', self.results)
+        self.results = re.sub('', '', self.results)
+        self.results = re.sub('</div>', '', self.results)
+        self.results = re.sub('<p>', '', self.results)
+        self.results = re.sub('</span>', '', self.results)
+        self.results = re.sub('</a>', '', self.results)
+        self.results = re.sub('<em>', '', self.results)
+        self.results = re.sub('<b>', '', self.results)
+        self.results = re.sub('</b>', '', self.results)
+        self.results = re.sub('</em>', '', self.results)
+        self.results = re.sub('%2f', ' ', self.results)
+        self.results = re.sub('%3a', ' ', self.results)
+        self.results = re.sub('<strong>', '', self.results)
+        self.results = re.sub('</strong>', '', self.results)
+        #self.results = re.sub('>', '', self.results)
+
+    def emails(self):
+        self.genericClean()
+        reg_emails = re.compile(
+            '[a-zA-Z0-9\.\-_]*' +
+            '@' +
+            '(?:[a-zA-Z0-9\.\-]*\.)?' +
+            self.word)
+        self.temp = reg_emails.findall(self.results)
+        emails = self.unique()
+        return emails
+
+    def unique(self):
+        self.new = []
+        for x in self.temp:
+            if x not in self.new:
+                self.new.append(x)
+        return self.new
+
+
+###################################################################
+
+class SearchEngine:
+    def __init__(self, urlPattern, word, limit, counterInit, counterStep):
+        self.results = ""
+        self.totalresults = ""
+        self.userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1"
+        self.limit = int(limit)
+        self.counter = int(counterInit)
+        self.urlPattern = urlPattern
+        self.step = int(counterStep)
+        self.word = word
+
+    def do_search(self):
+        try:
+            urly = self.urlPattern.format(counter=str(self.counter), word=self.word)
+            headers = {
+                'User-Agent': self.userAgent,
+            }
+            r = requests.get(urly, headers=headers)
+        except Exception as e:
+            print(e)
+            return
+        self.results = r.content.decode(r.encoding)
+        self.totalresults += self.results
+
+    def process(self):
+        while (self.counter < self.limit):
+            self.do_search()
+            time.sleep(1)
+            print(green("\tSearching " + str(self.counter) + " results..."))
+            self.counter += self.step
+
+    def get_emails(self):
+        rawres = myparser(self.totalresults, self.word)
+        return rawres.emails()
+
+###################################################################
+
+def yellow(text):
+    return colored(text, 'yellow', attrs=['bold'])
+
+def green(text):
+    return colored(text, 'green', attrs=['bold'])
+
+def blue(text):
+    return colored(text, 'blue', attrs=['bold'])
+
+def red(text):
+    return colored(text, 'red', attrs=['bold'])
+
+def unique(data):
+    unique = []
+    for x in data:
+        if x not in unique:
+            unique.append(x)
+    return unique
+
+###################################################################
+
+def limit_type(x):
+    x = int(x)
+    if x <= 0:
+        raise argparse.ArgumentTypeError("Minimum results limit is 1.")
+    return x
+
+def engine_type(x):
+    if x not in ("google", "bing", "yahoo", "ask", "all"):
+        raise argparse.ArgumentTypeError("Invalid search engine, try with: google, bing, yahoo, ask, all.")
+    return x
+
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="""
-
+
  _____ _ _ _ _ _
 | ___| (_)| | | | | | | |
 | |__ _ __ ___ __ _ _ | | | |_| | __ _ _ __ __ __ ___ ___ | |_ ___ _ __
 | __|| '_ ` _ \ / _` || || | | _ | / _` || '__|\ \ / // _ \/ __|| __|/ _ \| '__|
 | |___| | | | | || (_| || || | | | | || (_| || | \ V /| __/\__ \| |_| __/| |
 \____/|_| |_| |_| \__,_||_||_| \_| |_/ \__,_||_| \_/ \___||___/ \__|\___||_|
-
-""",
+
+  A tool to retrieve Domain email addresses from Search Engines | @maldevel
+  {}: {}
+""".format(red('Version'), yellow(__version__)),
                                      formatter_class=RawTextHelpFormatter)
+    parser.add_argument("-d", '--domain', metavar='DOMAIN', dest='domain', type=str, help="Domain to search.")
+    parser.add_argument("-s", '--save', metavar='FILE', dest='filename', type=str, help="Save the results into a TXT and XML file.")
+    parser.add_argument("-e", '--engine', metavar='ENGINE', dest='engine', default="all", type=engine_type, help="Select search engine (google, bing, yahoo, ask, all).")
+    parser.add_argument("-l", '--limit', metavar='LIMIT', dest='limit', type=limit_type, default=100, help="Limit the number of results.")
+
 
     if len(sys.argv) is 1:
         parser.print_help()
         sys.exit()
 
     args = parser.parse_args()
+
+    domain = ""
+    if(args.domain):
+        domain = args.domain
+    else:
+        print('[{}] {}'.format(red('ERROR'), "Please specify a domain name to search."))
+        sys.exit(2)
+
+    filename = ""
+    if(args.filename):
+        filename = args.filename
+
+    limit = args.limit
+    engine = args.engine
+
+
+    if engine == "google":
+        print(green("[-] Searching in Google..\n"))
+        search = SearchEngine("http://www.google.com/search?num=100&start={counter}&hl=en&q=%40\"{word}\"", domain, limit, 0, 100)
+        search.process()
+        all_emails = search.get_emails()
+
+    elif engine == "bing":
+        print(green("[-] Searching in Bing..\n"))
+        search = SearchEngine("http://www.bing.com/search?q=%40{word}&count=50&first={counter}", domain, limit, 0, 50)
+        search.process()
+        all_emails = search.get_emails()
+
+    elif engine == "ask":
+        print(green("[-] Searching in ASK..\n"))
+        search = SearchEngine("http://www.ask.com/web?q=%40{word}", domain, limit, 0, 100)
+        search.process()
+        all_emails = search.get_emails()
+
+    elif engine == "yahoo":
+        print(green("[-] Searching in Yahoo..\n"))
+        search = SearchEngine("http://search.yahoo.com/search?p=%40{word}&n=100&ei=UTF-8&va_vt=any&vo_vt=any&ve_vt=any&vp_vt=any&vd=all&vst=0&vf=all&vm=p&fl=0&fr=yfp-t-152&xargs=0&pstart=1&b={counter}", domain, limit, 1, 100)
+        search.process()
+        all_emails = search.get_emails()
+
+    elif engine == "all":
+        print(green("[-] Searching everywhere..\n"))
+        all_emails = []
+        print(green("[-] Searching in Google..\n"))
+        search = SearchEngine("http://www.google.com/search?num=100&start={counter}&hl=en&q=%40\"{word}\"", domain, limit, 0, 100)
+        search.process()
+        all_emails.extend(search.get_emails())
+        print(green("\n[-] Searching in Bing..\n"))
+        search = SearchEngine("http://www.bing.com/search?q=%40{word}&count=50&first={counter}", domain, limit, 0, 50)
+        search.process()
+        all_emails.extend(search.get_emails())
+        print(green("\n[-] Searching in ASK..\n"))
+        search = SearchEngine("http://www.ask.com/web?q=%40{word}", domain, limit, 0, 100)
+        search.process()
+        all_emails.extend(search.get_emails())
+        print(green("\n[-] Searching in Yahoo..\n"))
+        search = SearchEngine("http://search.yahoo.com/search?p=%40{word}&n=100&ei=UTF-8&va_vt=any&vo_vt=any&ve_vt=any&vp_vt=any&vd=all&vst=0&vf=all&vm=p&fl=0&fr=yfp-t-152&xargs=0&pstart=1&b={counter}", domain, limit, 1, 100)
+        search.process()
+        all_emails.extend(search.get_emails())
+        all_emails = unique(all_emails)
+
+    print(green("\n\n[+] Emails found:"))
+    print(green("------------------"))
+
+    if all_emails == []:
+        print(red("No emails found"))
+        sys.exit(3)
+    else:
+        for emails in all_emails:
+            print(emails)
+
+    if filename != "":
+        try:
+            print(green("[+] Saving files..."))
+            file = open(filename, 'w')
+            for email in all_emails:
+                try:
+                    file.write(email + "\n")
+                except:
+                    print(red("Exception " + email))
+                    pass
+            file.close()
+        except Exception as e:
+            print(red("Error saving TXT file: " + str(e)))
+
+        try:
+            filename = filename.split(".")[0] + ".xml"
+            file = open(filename, 'w')
+            file.write('<?xml version="1.0" encoding="UTF-8"?><EmailHarvester>')
+            for x in all_emails:
+                file.write('<email>' + x + '</email>')
+            file.write('</EmailHarvester>')
+            file.flush()
+            file.close()
+            print(green("Files saved!"))
+        except Exception as er:
+            print(red("Error saving XML file: " + str(er)))
+
+    sys.exit()
\ No newline at end of file
diff --git a/README.md b/README.md
index 3f80747..9254429 100644
--- a/README.md
+++ b/README.md
@@ -2,35 +2,68 @@ EmailHarvester
 ====
 * A tool to retrieve Domain email addresses from Search Engines
 
+This project was inspired by:
+* theHarvester (https://github.com/laramies/theHarvester) from laramies.
+* search_email_collector (https://github.com/rapid7/metasploit-framework/blob/master/modules/auxiliary/gather/search_email_collector.rb) from Carlos Perez.
+
 Requirements
 =====
 * Python 3.x
+* termcolor
+* colorama
+* requests
 
 Features
 =====
-
+* Retrieve Domain email addresses from Search Engines
+* Google
+* Bing
+* Yahoo
+* ASK
 
 Download/Installation
 ====
 * git clone https://github.com/maldevel/EmailHarvester
+* pip install -r requirements.txt --user
 
-Setup
+Usage
 =====
+```
+usage: EmailHarvester.py [-h] [-d DOMAIN] [-s FILE] [-e ENGINE] [-l LIMIT]
+
+ _____ _ _ _ _ _
+| ___| (_)| | | | | | | |
+| |__ _ __ ___ __ _ _ | | | |_| | __ _ _ __ __ __ ___ ___ | |_ ___ _ __
+| __|| '_ ` _ \ / _` || || | | _ | / _` || '__|\ \ / // _ \/ __|| __|/ _ \| '__|
+| |___| | | | | || (_| || || | | | | || (_| || | \ V /| __/\__ \| |_| __/| |
+\____/|_| |_| |_| \__,_||_||_| \_| |_/ \__,_||_| \_/ \___||___/ \__|\___||_|
-
-Contents
-=====
-
+
+  A tool to retrieve Domain email addresses from Search Engines | @maldevel
+  Version: 1.0
+
-Usage
-=====
-```
+optional arguments:
+  -h, --help            show this help message and exit
+  -d DOMAIN, --domain DOMAIN
+                        Domain to search.
+  -s FILE, --save FILE  Save the results into a TXT and XML file.
+  -e ENGINE, --engine ENGINE
+                        Select search engine (google, bing, yahoo, ask, all).
+  -l LIMIT, --limit LIMIT
+                        Limit the number of results.
 ```
 
 Examples
 =====
+* Search in Google
+./EmailHarvester.py -d example.com -e google
+
+* Search in all engines
+./EmailHarvester.py -d example.com -e all
+
+* Limit results
+./EmailHarvester.py -d example.com -e all -l 200
diff --git a/requirements.txt b/requirements.txt
index e69de29..63a54a1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+termcolor
+colorama
+requests
\ No newline at end of file
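
For reference, a minimal standalone sketch of the extraction idea behind `myparser.emails()` in the patch above: collect every address whose domain ends with the target word, then de-duplicate. The sample input string is invented for illustration, and `re.escape()` is an extra precaution added here; the patch itself concatenates the domain into the pattern unescaped.

```python
import re

def extract_emails(text, word):
    # Same shape as the pattern built in myparser.emails(): local part, "@",
    # optional subdomain(s), then the target domain. re.escape() is not in the patch.
    pattern = re.compile(r'[a-zA-Z0-9.\-_]*@(?:[a-zA-Z0-9.\-]*\.)?' + re.escape(word))
    return sorted(set(pattern.findall(text)))

if __name__ == '__main__':
    sample = 'see <a href="mailto:info@example.com">info@example.com</a> and sales@mail.example.com'
    print(extract_emails(sample, 'example.com'))
    # ['info@example.com', 'sales@mail.example.com']
```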
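The XML report written at the end of `__main__` is a flat `<EmailHarvester>` element with one `<email>` child per address, so it can be read back with the standard library. A short sketch under that assumption; `results.xml` is a placeholder name (running with `-s results.txt` makes the patch derive `results.xml` from it), and note the patch writes addresses without XML escaping, so unusual input could yield a file that does not parse.

```python
import xml.etree.ElementTree as ET

# Read the report produced by the -s option; "results.xml" is an assumed example path.
tree = ET.parse("results.xml")
emails = [e.text for e in tree.getroot().findall("email")]
print(emails)
```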