Commit

Merge pull request #8 from ecoron/0.5.2
0.5.2
Ronald Schmidt authored May 7, 2017
2 parents 9712c54 + f187358 commit 07ec85c
Showing 19 changed files with 171 additions and 479 deletions.
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -58,9 +58,9 @@
# built documents.
#
# The short X.Y version.
-version = '0.5'
+version = '0.6'
# The full version, including alpha/beta/rc tags.
-release = '0.5.1'
+release = '0.6.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
3 changes: 2 additions & 1 deletion docs/configuration.rst
@@ -20,7 +20,8 @@ Default configuration
* search_engines: ['google'] - search engines (google)
* url_threads: 3 - number of threads if scrape_urls is true
* use_own_ip: True - if using proxies set to False
-
+* sleeping_min: 5 - min seconds to sleep between scrapes
+* sleeping_max: 15 - max seconds to sleep between scrapes

Custom configuration
--------------------
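The two new sleep options can be set like any other key on serpscrap.Config. A minimal sketch, assuming the init/run API shown in this repository's examples (keyword and values illustrative):

import serpscrap

config = serpscrap.Config()
config.set('sleeping_min', 5)   # wait at least 5 seconds between scrapes
config.set('sleeping_max', 15)  # wait at most 15 seconds between scrapes

scrap = serpscrap.SerpScrap()
scrap.init(config=config.get(), keywords=['example keyword'])
results = scrap.run()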
29 changes: 20 additions & 9 deletions examples/example_advanced.py
@@ -6,8 +6,8 @@

config = serpscrap.Config()

-config.set('scrape_urls', True)
-config.set('num_pages_for_keyword', 2)
+config.set('scrape_urls', False)
+config.set('num_pages_for_keyword', 5)
config.set('url_threads', 5)

scrap = serpscrap.SerpScrap()
@@ -18,11 +18,17 @@

models = []

+print('--- origin titles ---')
 for result in results:
     if 'serp_title' in result and len(result['serp_title']) > 1:
-        model = markovi.get_model(result['serp_title'], 1)
-        if model.state_size > 0:
-            models.append(model)
+        print(result['serp_title'])
+        try:
+            model = markovi.get_model(result['serp_title'], 1)
+            if model.state_size > 0:
+                models.append(model)
+        except Exception:
+            pass
+print('--- --- ---')

model = markovi.get_combined_model(models)

@@ -32,21 +38,26 @@
         char_limit=150,
         tries=10,
         max_overlap_ratio=0.7,
-        max_overlap_total=25
+        max_overlap_total=20
     )
     if isinstance(text, str):
         texts.append(text)

+print('--- Generated Titles 1. iteration ---')
 for text in texts:
-    print(text+'\n')
+    print(text)
+print('--- --- ---')

 tf = serpscrap.TfIdf().get_tfidf(texts)
-print(tf[0:10])
+print('--- TfIdf Titles ---')
+print(tf)
+print('--- --- ---')

 model = markovi.get_model("\n".join(texts), 1)
+print('--- Generated Titles 2. iteration ---')
 for _ in range(10):
     text = model.make_short_sentence(
-        char_limit=80,
+        max_chars=80,
         tries=10,
         max_overlap_ratio=0.7,
         max_overlap_total=20
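The char_limit → max_chars rename in the last hunk tracks the markovify bump from 0.5.4 to 0.6.0 in requirements.txt below, which renamed that parameter of make_short_sentence. A standalone sketch of the updated call (corpus invented for illustration):

import markovify

corpus = 'the quick brown fox jumps over the lazy dog. ' * 20
model = markovify.Text(corpus, state_size=1)
text = model.make_short_sentence(
    max_chars=80,  # was char_limit in markovify 0.5.x
    tries=10,
    max_overlap_ratio=0.7,
    max_overlap_total=20
)
if isinstance(text, str):  # make_short_sentence may return None
    print(text)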
8 changes: 5 additions & 3 deletions examples/example_markovi.py
@@ -1,9 +1,10 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
-from serpscrap.markovi import Markovi
+import pprint
+
 from serpscrap.config import Config
+from serpscrap.markovi import Markovi
 from serpscrap.urlscrape import UrlScrape
-import pprint


url = 'http://gutenberg.spiegel.de/buch/johann-wolfgang-goethe-gedichte-3670/231'
@@ -18,4 +19,5 @@
 for _ in range(5):
     texts.append(markovi.generate(content.__getitem__('text_raw'), 1))

-pprint.pprint(texts, width=120)
+for text in texts:
+    pprint.pprint(text, width=120)
10 changes: 5 additions & 5 deletions requirements.txt
@@ -2,13 +2,13 @@ numpy==1.12.1
scipy==0.19.0
scikit-learn==0.18.1
lxml
-chardet==2.3.0
-beautifulsoup4==4.5.3
+chardet==3.0.2
+beautifulsoup4==4.6.0
html2text==2016.9.19
-markovify==0.5.4
+markovify==0.6.0
PySocks==1.6.7
sqlalchemy==1.0.12
-selenium==3.3.3
-cssselect==0.9.1
+selenium==3.4.1
+cssselect==1.0.1
requests==2.13.0
aiohttp==0.21.5
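After reinstalling, a quick sketch to confirm the bumped pins resolved (package names exactly as pinned above; pkg_resources ships with setuptools):

import pkg_resources

for name in ['chardet', 'beautifulsoup4', 'markovify', 'selenium', 'cssselect']:
    print(name, pkg_resources.get_distribution(name).version)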
2 changes: 1 addition & 1 deletion scrapcore/core.py
@@ -55,7 +55,7 @@ def main(self, return_results=False, config=None):
         num_workers = int(config.get('num_workers'))
         scrape_method = config.get('scrape_method')
         pages = int(config.get('num_pages_for_keyword', 1))
-        method = config.get('scrape_method', 'http')
+        method = config.get('scrape_method', 'selenium')

         result_writer = ResultWriter()
         result_writer.init_outfile(config, force_reload=True)
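Because the fallback moves from 'http' to 'selenium', configs that never set scrape_method now get the selenium driver. A sketch mirroring the keys core.py reads in this hunk (values illustrative):

import serpscrap

config = serpscrap.Config()
config.set('num_workers', 1)             # workers read in main()
config.set('num_pages_for_keyword', 2)   # SERP pages per keyword
config.set('scrape_method', 'selenium')  # explicit, matching the new default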
4 changes: 0 additions & 4 deletions scrapcore/database.py
@@ -149,11 +149,7 @@ def set_values_from_scraper(self, scraper):
"""Populate itself from a scraper object.
A scraper may be any object of type:
- SelScrape
- HttpScrape
- AsyncHttpScrape
Args:
A scraper object.
"""
6 changes: 3 additions & 3 deletions scrapcore/parser/google_parser.py
@@ -67,12 +67,12 @@ class GoogleParser(Parser):
         'de_ip': {
             'container': '#center_col',
             'result_container': '.ads-ad',
-            'link': 'h3 > a:first-child::attr(href)',
+            'link': 'h3 > a:nth-child(2)::attr(href)',
             'snippet': '.ads-creative::text',
-            'title': 'h3 > a:first-child::text',
+            'title': 'h3 > a:nth-child(2)::text',
             'visible_link': '.ads-visurl cite::text',
             'rating': 'div._Ond _Bu span::text',
-            'sitelinks': 'div.osl::text'
+            'sitelinks': 'ul._wEo::text'
         }
     },
     'ads_aside': {
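The selectors in this hunk carry ::text and ::attr(href) pseudo-element suffixes on top of plain CSS, which the parser presumably strips before matching. With the suffixes removed, the updated ad selector can be sanity-checked against a saved SERP with lxml plus cssselect (both pinned in requirements.txt); the HTML fragment here is invented for illustration:

from lxml import html  # .cssselect() requires the cssselect package

doc = html.fromstring('''
<div id="center_col">
  <div class="ads-ad">
    <h3><span>Ad</span><a href="http://example.com/landing">Example title</a></h3>
    <div class="ads-visurl"><cite>example.com</cite></div>
  </div>
</div>
''')

for ad in doc.cssselect('.ads-ad'):
    links = ad.cssselect('h3 > a:nth-child(2)')  # suffix ::attr(href) dropped
    if links:
        print(links[0].get('href'), links[0].text_content())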
(diffs for the remaining 11 changed files not loaded)
