Commit

Merge pull request #8 from ecoron/0.5.2
0.5.2
Ronald Schmidt authored May 7, 2017
2 parents 9712c54 + f187358 commit 07ec85c
Showing 19 changed files with 171 additions and 479 deletions.
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -58,9 +58,9 @@
# built documents.
#
# The short X.Y version.
-version = '0.5'
+version = '0.6'
# The full version, including alpha/beta/rc tags.
-release = '0.5.1'
+release = '0.6.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
3 changes: 2 additions & 1 deletion docs/configuration.rst
@@ -20,7 +20,8 @@ Default configuration
* search_engines: ['google'] - search engines (google)
* url_threads: 3 - number of threads if scrape_urls is true
* use_own_ip: True - if using proxies set to False
-
+* sleeping_min: 5 - min seconds to sleep between scrapes
+* sleeping_max: 15 - max seconds to sleep between scrapes

Custom configuration
--------------------
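The two new sleep options can be set like any other key on serpscrap.Config. A minimal sketch, assuming the init/run API shown in this repository's examples (keyword and values illustrative):

import serpscrap

config = serpscrap.Config()
config.set('sleeping_min', 5)   # wait at least 5 seconds between scrapes
config.set('sleeping_max', 15)  # wait at most 15 seconds between scrapes

scrap = serpscrap.SerpScrap()
scrap.init(config=config.get(), keywords=['example keyword'])
results = scrap.run()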
29 changes: 20 additions & 9 deletions examples/example_advanced.py
@@ -6,8 +6,8 @@

config = serpscrap.Config()

-config.set('scrape_urls', True)
-config.set('num_pages_for_keyword', 2)
+config.set('scrape_urls', False)
+config.set('num_pages_for_keyword', 5)
config.set('url_threads', 5)

scrap = serpscrap.SerpScrap()
@@ -18,11 +18,17 @@

models = []

+print('--- origin titles ---')
 for result in results:
     if 'serp_title' in result and len(result['serp_title']) > 1:
-        model = markovi.get_model(result['serp_title'], 1)
-        if model.state_size > 0:
-            models.append(model)
+        print(result['serp_title'])
+        try:
+            model = markovi.get_model(result['serp_title'], 1)
+            if model.state_size > 0:
+                models.append(model)
+        except Exception:
+            pass
+print('--- --- ---')

model = markovi.get_combined_model(models)

@@ -32,21 +38,26 @@
         char_limit=150,
         tries=10,
         max_overlap_ratio=0.7,
-        max_overlap_total=25
+        max_overlap_total=20
     )
     if isinstance(text, str):
         texts.append(text)

+print('--- Generated Titles 1. iteration ---')
 for text in texts:
-    print(text+'\n')
+    print(text)
+print('--- --- ---')

 tf = serpscrap.TfIdf().get_tfidf(texts)
-print(tf[0:10])
+print('--- TfIdf Titles ---')
+print(tf)
+print('--- --- ---')

 model = markovi.get_model("\n".join(texts), 1)
+print('--- Generated Titles 2. iteration ---')
 for _ in range(10):
     text = model.make_short_sentence(
-        char_limit=80,
+        max_chars=80,
         tries=10,
         max_overlap_ratio=0.7,
         max_overlap_total=20
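The char_limit → max_chars rename in the last hunk tracks the markovify bump from 0.5.4 to 0.6.0 in requirements.txt below, which renamed that parameter of make_short_sentence. A standalone sketch of the updated call (corpus invented for illustration):

import markovify

corpus = 'the quick brown fox jumps over the lazy dog. ' * 20
model = markovify.Text(corpus, state_size=1)
text = model.make_short_sentence(
    max_chars=80,  # was char_limit in markovify 0.5.x
    tries=10,
    max_overlap_ratio=0.7,
    max_overlap_total=20
)
if isinstance(text, str):  # make_short_sentence may return None
    print(text)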
8 changes: 5 additions & 3 deletions examples/example_markovi.py
@@ -1,9 +1,10 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
-from serpscrap.markovi import Markovi
+import pprint
+
 from serpscrap.config import Config
+from serpscrap.markovi import Markovi
 from serpscrap.urlscrape import UrlScrape
-import pprint


url = 'http://gutenberg.spiegel.de/buch/johann-wolfgang-goethe-gedichte-3670/231'
@@ -18,4 +19,5 @@
 for _ in range(5):
     texts.append(markovi.generate(content.__getitem__('text_raw'), 1))

-pprint.pprint(texts, width=120)
+for text in texts:
+    pprint.pprint(text, width=120)
10 changes: 5 additions & 5 deletions requirements.txt
@@ -2,13 +2,13 @@ numpy==1.12.1
scipy==0.19.0
scikit-learn==0.18.1
lxml
-chardet==2.3.0
-beautifulsoup4==4.5.3
+chardet==3.0.2
+beautifulsoup4==4.6.0
html2text==2016.9.19
-markovify==0.5.4
+markovify==0.6.0
PySocks==1.6.7
sqlalchemy==1.0.12
-selenium==3.3.3
-cssselect==0.9.1
+selenium==3.4.1
+cssselect==1.0.1
requests==2.13.0
aiohttp==0.21.5
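After reinstalling, a quick sketch to confirm the bumped pins resolved (package names exactly as pinned above; pkg_resources ships with setuptools):

import pkg_resources

for name in ['chardet', 'beautifulsoup4', 'markovify', 'selenium', 'cssselect']:
    print(name, pkg_resources.get_distribution(name).version)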
2 changes: 1 addition & 1 deletion scrapcore/core.py
@@ -55,7 +55,7 @@ def main(self, return_results=False, config=None):
         num_workers = int(config.get('num_workers'))
         scrape_method = config.get('scrape_method')
         pages = int(config.get('num_pages_for_keyword', 1))
-        method = config.get('scrape_method', 'http')
+        method = config.get('scrape_method', 'selenium')

         result_writer = ResultWriter()
         result_writer.init_outfile(config, force_reload=True)
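Because the fallback moves from 'http' to 'selenium', configs that never set scrape_method now get the selenium driver. A sketch mirroring the keys core.py reads in this hunk (values illustrative):

import serpscrap

config = serpscrap.Config()
config.set('num_workers', 1)             # workers read in main()
config.set('num_pages_for_keyword', 2)   # SERP pages per keyword
config.set('scrape_method', 'selenium')  # explicit, matching the new default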
4 changes: 0 additions & 4 deletions scrapcore/database.py
@@ -149,11 +149,7 @@ def set_values_from_scraper(self, scraper):
"""Populate itself from a scraper object.
A scraper may be any object of type:
- SelScrape
- HttpScrape
- AsyncHttpScrape
Args:
A scraper object.
"""
6 changes: 3 additions & 3 deletions scrapcore/parser/google_parser.py
@@ -67,12 +67,12 @@ class GoogleParser(Parser):
         'de_ip': {
             'container': '#center_col',
             'result_container': '.ads-ad',
-            'link': 'h3 > a:first-child::attr(href)',
+            'link': 'h3 > a:nth-child(2)::attr(href)',
             'snippet': '.ads-creative::text',
-            'title': 'h3 > a:first-child::text',
+            'title': 'h3 > a:nth-child(2)::text',
             'visible_link': '.ads-visurl cite::text',
             'rating': 'div._Ond _Bu span::text',
-            'sitelinks': 'div.osl::text'
+            'sitelinks': 'ul._wEo::text'
         }
     },
     'ads_aside': {
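The selectors in this hunk carry ::text and ::attr(href) pseudo-element suffixes on top of plain CSS, which the parser presumably strips before matching. With the suffixes removed, the updated ad selector can be sanity-checked against a saved SERP with lxml plus cssselect (both pinned in requirements.txt); the HTML fragment here is invented for illustration:

from lxml import html  # .cssselect() requires the cssselect package

doc = html.fromstring('''
<div id="center_col">
  <div class="ads-ad">
    <h3><span>Ad</span><a href="http://example.com/landing">Example title</a></h3>
    <div class="ads-visurl"><cite>example.com</cite></div>
  </div>
</div>
''')

for ad in doc.cssselect('.ads-ad'):
    links = ad.cssselect('h3 > a:nth-child(2)')  # suffix ::attr(href) dropped
    if links:
        print(links[0].get('href'), links[0].text_content())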
(diffs for the remaining 11 changed files not loaded)
