Merge pull request #19 from ecoron/0.9.1
0.9.1
Ronald Schmidt authored Sep 14, 2017
2 parents a73a7be + caf8cc3 commit 963a7e1
Showing 9 changed files with 119 additions and 40 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -60,7 +60,7 @@
# The short X.Y version.
version = '0.9'
# The full version, including alpha/beta/rc tags.
release = '0.9.0'
release = '0.9.1'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
5 changes: 4 additions & 1 deletion docs/configuration.rst
@@ -96,7 +96,10 @@ don't customize this setting, the default is used.
Proxy file
----------
This feature is not stable in versions <= 0.9.1 if you use more than one worker
and have more than one proxy in your file.
You can provide a list of proxies which should be used for scraping the search engines.
For this you have to create a proxy_file and set the path to the file in the configuration.
The proxy_file should look like this
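As an illustration (the exact proxy_file line format and the path below are assumptions for this sketch, not taken from this commit), pointing the configuration at such a file might look like this:

.. code-block:: python

    import serpscrap

    # assumed proxy_file format: one proxy per line, e.g.
    #   http 111.111.111.111:8080
    #   socks5 222.222.222.222:1080 username:password
    config = serpscrap.Config()
    config.set('proxy_file', '/tmp/proxy_file.txt')  # illustrative path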
31 changes: 31 additions & 0 deletions docs/examples.rst
@@ -59,6 +59,35 @@ custom path to the binary.
if 'serp_title' in result and len(result['serp_title']) > 1:
print(result['serp_title'])
Using Chrome
------------

.. code-block:: bash
python examples\example_chrome.py
It is possible to use Chrome, but we recommend PhantomJS, which is installed by default.
To use Chrome you need to download the latest `chromedriver`_ and set the executable_path.

.. code-block:: python
import pprint
import serpscrap
keywords = ['berlin']
config = serpscrap.Config()
config.set('sel_browser', 'chrome')
config.set('executable_path', '/tmp/chromedriver_win32/chromedriver.exe')
scrap = serpscrap.SerpScrap()
scrap.init(config=config.get(), keywords=keywords)
results = scrap.run()
for result in results:
pprint.pprint(result)
print()
Image search
------------
@@ -157,3 +186,5 @@ References
.. _`examples`: https://github.com/ecoron/SerpScrap/tree/master/examples
.. _`example_simple.py`: https://github.com/ecoron/SerpScrap/blob/master/examples/example_simple.py
.. _`example_related.py`: https://github.com/ecoron/SerpScrap/blob/master/examples/example_related.py
.. _`chromedriver`: https://sites.google.com/a/chromium.org/chromedriver/downloads
18 changes: 18 additions & 0 deletions examples/example_chrome.py
@@ -0,0 +1,18 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import pprint
import serpscrap

keywords = ['berlin']

config = serpscrap.Config()
config.set('sel_browser', 'chrome')
config.set('executable_path', '/tmp/chromedriver_win32/chromedriver.exe')

scrap = serpscrap.SerpScrap()
scrap.init(config=config.get(), keywords=keywords)
results = scrap.run()

for result in results:
pprint.pprint(result)
print()
11 changes: 6 additions & 5 deletions scrapcore/core.py
@@ -4,6 +4,7 @@
import queue
import threading

from random import shuffle
from scrapcore.cachemanager import CacheManager
from scrapcore.database import ScraperSearch
from scrapcore.database import get_session, fixtures
@@ -78,6 +79,7 @@ def main(self, return_results=False, config=None):

if not proxies:
raise Exception('''No proxies available. Turning down.''')
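# shuffle the proxy list so workers do not all start with the same proxy (presumed intent of this change)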
shuffle(proxies)

# get a scoped sqlalchemy session
session_cls = get_session(config, scoped=True)
@@ -122,10 +124,10 @@ def main(self, return_results=False, config=None):
self.logger.info('''
Going to scrape {num_keywords} keywords with {num_proxies}
proxies by using {num_threads} threads.'''.format(
num_keywords=len(list(scrape_jobs)),
num_proxies=len(proxies),
num_threads=num_search_engines)
)
num_keywords=len(list(scrape_jobs)),
num_proxies=len(proxies),
num_threads=num_search_engines)
)

progress_thread = None

@@ -139,7 +141,6 @@
for search_engine in search_engines:

for proxy in proxies:

for worker in range(num_workers):
num_worker += 1
workers.put(
81 changes: 52 additions & 29 deletions scrapcore/scraper/selenium.py
@@ -60,7 +60,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
'yahoo': '.compPagination .next',
'baidu': '.n',
'ask': '#paging div a.txt3.l_nu',
'blekko': '',
'duckduckgo': '',
'googleimg': '#pnnext',
'baiduimg': '.n',
@@ -74,7 +73,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
'baidu': (By.NAME, 'wd'),
'duckduckgo': (By.NAME, 'q'),
'ask': (By.NAME, 'q'),
'blekko': (By.NAME, 'q'),
'google': (By.NAME, 'q'),
'googleimg': (By.NAME, 'as_q'),
'baiduimg': (By.NAME, 'word'),
@@ -102,7 +100,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
'baidu': 'http://baidu.com/',
'duckduckgo': 'https://duckduckgo.com/',
'ask': 'http://ask.com/',
'blekko': 'http://blekko.com/',
}

image_search_locations = {
@@ -113,7 +110,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
'baidu': 'http://image.baidu.com/',
'duckduckgo': None, # duckduckgo doesn't support direct image search
'ask': 'http://www.ask.com/pictures/',
'blekko': None,
'googleimg': 'https://www.google.com/advanced_image_search',
'baiduimg': 'http://image.baidu.com/',
}
@@ -168,6 +164,7 @@ def proxy_check(self, proxy):

try:
self.webdriver.get(self.config.get('proxy_info_url'))
time.sleep(2)
try:
text = re.search(
r'(\{.*?\})',
@@ -211,7 +208,10 @@ def _save_debug_screenshot(self):
str(self.page_number),
)
)
self.webdriver.get_screenshot_as_file(location)
try:
self.webdriver.get_screenshot_as_file(location)
except (ConnectionError, ConnectionRefusedError, ConnectionResetError) as err:
logger.error(err)

def _set_xvfb_display(self):
# TODO: should we check the format of the config?
@@ -237,6 +237,7 @@ def _get_webdriver(self):

def _get_Chrome(self):
try:
chrome_ops = webdriver.ChromeOptions()
if self.proxy:
chrome_ops = webdriver.ChromeOptions()
chrome_ops.add_argument(
@@ -247,13 +248,28 @@ def _get_Chrome(self):
)
)
self.webdriver = webdriver.Chrome(
executable_path=self.config['executebale_path'],
executable_path=self.config['executable_path'],
chrome_options=chrome_ops
)
else:
self.webdriver = webdriver.Chrome(
executable_path=self.config['executable_path']

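# the arguments below disable the Chrome sandbox and randomize window position and size, presumably to vary the browser footprint between runs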
chrome_ops.add_argument('--no-sandbox')
chrome_ops.add_argument('--start-maximized')
chrome_ops.add_argument(
'--window-position={},{}'.format(
randint(10, 30),
randint(10, 30)
)
)
chrome_ops.add_argument(
'--window-size={},{}'.format(
randint(800, 1024),
randint(600, 900)
)
)
self.webdriver = webdriver.Chrome(
executable_path=self.config['executable_path'],
chrome_options=chrome_ops
)
return True
except WebDriverException:
raise
@@ -326,12 +342,16 @@ def _get_PhantomJS(self):
logger.info('useragent: {}'.format(useragent))
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = useragent
self.webdriver = webdriver.PhantomJS(
executable_path=self.config['executable_path'],
service_args=service_args,
desired_capabilities=dcap
)
return True
try:
self.webdriver = webdriver.PhantomJS(
executable_path=self.config['executable_path'],
service_args=service_args,
desired_capabilities=dcap
)
return True
except (ConnectionError, ConnectionRefusedError, ConnectionResetError) as err:
logger.error(err)
return False
except WebDriverException as e:
logger.error(e)
return False
@@ -472,7 +492,7 @@ def _goto_next_page(self):
element.click()
except WebDriverException:
# See http://stackoverflow.com/questions/11908249/debugging-element-is-not-clickable-at-point-error
# first move mouse to the next element, some times the element is not visibility, like blekko.com
# first move the mouse to the next element, sometimes the element is not visible
selector = self.next_page_selectors[self.search_engine_name]
if selector:
try:
@@ -550,7 +570,7 @@ def wait_until_serp_loaded(self):
elif self.search_engine_name == 'ask':
selector = '#paging .pgcsel .pg'

content = None
# content = None
try:
time.sleep(1)
WebDriverWait(self.webdriver, 5).until(
@@ -562,7 +582,7 @@
except TimeoutException:
self._save_debug_screenshot()
try:
content = self.webdriver.find_element_by_css_selector(selector).text
self.webdriver.find_element_by_css_selector(selector).text
except NoSuchElementException:
logger.error('Skip it, no such element - SeleniumSearchError')
raise SeleniumSearchError('Stop Scraping, seems we are blocked')
@@ -614,7 +634,9 @@ def search(self):
if self.search_param_fields:
wait_res = self._wait_until_search_param_fields_appears()
if wait_res is False:
self.quit()
raise Exception('Waiting search param input fields time exceeds')

for param, field in self.search_param_fields.items():
if field[0] == By.ID:
js_tpl = '''
@@ -635,7 +657,11 @@
self.search_input.send_keys(self.query + Keys.ENTER)
except ElementNotVisibleException:
time.sleep(2)
self.search_input.send_keys(self.query + Keys.ENTER)
try:
self.search_input.send_keys(self.query + Keys.ENTER)
except Exception:
logger.error('send keys not possible, maybe page could not be loaded')
self.quit()
except Exception:
logger.error('send keys not possible')
pass
@@ -656,6 +682,8 @@
self._save_debug_screenshot()
time.sleep(.5)
self.html = self.webdriver.execute_script('return document.body.innerHTML;')
except (ConnectionError, ConnectionRefusedError, ConnectionResetError) as err:
logger.error(err)
except WebDriverException:
self.html = self.webdriver.page_source

@@ -707,8 +735,11 @@ def run(self):
self.build_search()
self.search()

if self.webdriver:
self.webdriver.quit()
self.quit()

def quit(self):
if self.webdriver:
self.webdriver.quit()


"""
@@ -754,14 +785,6 @@ def wait_until_serp_loaded(self):
super()._wait_until_search_input_field_appears()


class BlekkoSelScrape(SelScrape):
def __init__(self, *args, **kwargs):
SelScrape.__init__(self, *args, **kwargs)

def _goto_next_page(self):
pass


class AskSelScrape(SelScrape):
def __init__(self, *args, **kwargs):
SelScrape.__init__(self, *args, **kwargs)
7 changes: 5 additions & 2 deletions scrapcore/scraping.py
@@ -308,5 +308,8 @@ def update_proxy_status(self, status, ipinfo=None, online=True):
proxy.status = status
proxy.online = online

self.session.add(proxy)
self.session.commit()
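# merge() instead of add(): presumably updates an already-persisted proxy row instead of inserting a duplicate; any error is swallowed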
try:
self.session.merge(proxy, load=True)
self.session.commit()
except:
pass
2 changes: 1 addition & 1 deletion serpscrap/config.py
@@ -53,7 +53,7 @@ class Config():
},
'proxy_file': '',
'proxy_check_url': 'http://canihazip.com/s',
'proxy_info_url': 'http://ipinfo.io/json',
'proxy_info_url': 'https://ipinfo.io/json',
'stop_on_detection': True,
'today': datetime.datetime.strftime(
datetime.datetime.utcnow(),
2 changes: 1 addition & 1 deletion setup.py
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
from setuptools import setup, find_packages

version = '0.9.0'
version = '0.9.1'


setup(
