Skip to content

Commit

Permalink
Merge pull request #33 from tikazyq/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
Marvin Zhang authored May 27, 2019
2 parents 11ce688 + 8d3d7b3 commit 0a67d6a
Show file tree
Hide file tree
Showing 69 changed files with 2,063 additions and 2,464 deletions.
2 changes: 2 additions & 0 deletions crawlab/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,5 @@ node_modules/

# egg-info
*.egg-info

tmp/
3 changes: 2 additions & 1 deletion crawlab/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@
'/api/schedules/<string:id>')
api.add_resource(SiteApi,
'/api/sites',
'/api/sites/<string:id>')
'/api/sites/<string:id>',
'/api/sites/get/<string:action>')


def monitor_nodes_status(celery_app):
Expand Down
2 changes: 1 addition & 1 deletion crawlab/bin/run_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import tasks.deploy

if __name__ == '__main__':
if sys.platform == 'windows':
if 'win' in sys.platform:
celery_app.start(argv=['tasks', 'worker', '-P', 'eventlet', '-E', '-l', 'INFO'])
else:
celery_app.start(argv=['tasks', 'worker', '-E', '-l', 'INFO'])
21 changes: 18 additions & 3 deletions crawlab/constants/spider.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
class SpiderType:
    """String constants identifying the kind of spider a record represents."""
    SCRAPY = 'scrapy'
    PYSPIDER = 'pyspider'
    WEBMAGIC = 'webmagic'
    # Spider driven purely by stored configuration (no user code) — presumably; confirm against usage.
    CONFIGURABLE = 'configurable'
    # Spider backed by user-supplied code — presumably; confirm against usage.
    CUSTOMIZED = 'customized'


class LangType:
Expand All @@ -17,6 +16,22 @@ class CronEnabled:
OFF = 0


class CrawlType:
    """String constants for the crawl strategy of a configurable spider."""
    LIST = 'list'
    DETAIL = 'detail'
    # Crawl list pages first, then follow into detail pages.
    LIST_DETAIL = 'list-detail'


class QueryType:
    """String constants for the selector language used to locate elements."""
    CSS = 'css'
    XPATH = 'xpath'


class ExtractType:
    """String constants for what to extract from a matched element: its text or an attribute."""
    TEXT = 'text'
    ATTRIBUTE = 'attribute'


# File suffixes to ignore (compiled Python bytecode).
SUFFIX_IGNORE = [
    'pyc'
]
Expand Down
4 changes: 4 additions & 0 deletions crawlab/db/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,5 +179,9 @@ def create_index(self, col_name: str, keys: dict, **kwargs):
col = self.db[col_name]
col.create_index(keys=keys, **kwargs)

def distinct(self, col_name: str, key: str, filter: dict):
    """Return the sorted distinct values of ``key`` in collection ``col_name``.

    :param col_name: name of the MongoDB collection to query
    :param key: field whose distinct values are collected
    :param filter: query document restricting which documents are considered
        (named ``filter`` to mirror PyMongo's API; shadows the builtin)
    :return: sorted list of distinct values
    """
    collection = self.db[col_name]
    values = collection.distinct(key, filter)
    return sorted(values)


db_manager = DbManager()
44 changes: 44 additions & 0 deletions crawlab/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,35 +1,79 @@
aiohttp==3.5.4
amqp==2.4.2
aniso8601==6.0.0
Appium-Python-Client==0.40
APScheduler==3.6.0
asn1crypto==0.24.0
async-timeout==3.0.1
attrs==19.1.0
Automat==0.7.0
Babel==2.6.0
beautifulsoup4==4.7.1
billiard==3.6.0.0
bs4==0.0.1
cachetools==3.1.0
celery==4.3.0
certifi==2019.3.9
cffi==1.12.3
chardet==3.0.4
Click==7.0
coloredlogs==10.0
constantly==15.1.0
cryptography==2.6.1
cssselect==1.0.3
Flask==1.0.2
Flask-APScheduler==1.11.0
Flask-Cors==3.0.7
Flask-RESTful==0.3.7
flask-restplus==0.12.1
flower==0.9.3
gevent==1.4.0
greenlet==0.4.15
gunicorn==19.9.0
html5lib==1.0.1
humanfriendly==4.18
hyperlink==19.0.0
idna==2.8
idna-ssl==1.1.0
incremental==17.5.0
itsdangerous==1.1.0
Jinja2==2.10
jsonpickle==1.1
jsonschema==3.0.1
kombu==4.5.0
lxml==4.3.3
MarkupSafe==1.1.1
mongoengine==0.17.0
multidict==4.5.2
parsel==1.5.1
pyasn1==0.4.5
pyasn1-modules==0.2.5
pycparser==2.19
PyDispatcher==2.0.5
PyHamcrest==1.9.0
pymongo==3.7.2
pyOpenSSL==19.0.0
pyrsistent==0.14.11
python-dateutil==2.8.0
pytz==2018.9
queuelib==1.5.0
redis==3.2.1
redisbeat==1.1.4
reppy==0.4.12
requests==2.21.0
Scrapy==1.6.0
selenium==3.141.0
service-identity==18.1.0
six==1.12.0
soupsieve==1.9.1
tornado==5.1.1
Twisted==19.2.0
typing-extensions==3.7.2
tzlocal==1.5.1
urllib3==1.24.1
vine==1.3.0
w3lib==1.20.0
webencodings==0.5.1
Werkzeug==0.15.2
yarl==1.3.0
zope.interface==4.6.0
4 changes: 2 additions & 2 deletions crawlab/routes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def put(self) -> (dict, tuple):

self.after_update()

return item
return jsonify(item)

def update(self, id: str = None) -> (dict, tuple):
"""
Expand All @@ -137,7 +137,7 @@ def update(self, id: str = None) -> (dict, tuple):
# execute after_update hook
self.after_update(id)

return item
return jsonify(item)

def post(self, id: str = None, action: str = None):
"""
Expand Down
18 changes: 18 additions & 0 deletions crawlab/routes/sites.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class SiteApi(BaseApi):

arguments = (
('keyword', str),
('main_category', str),
('category', str),
)

Expand Down Expand Up @@ -70,3 +71,20 @@ def get(self, id: str = None, action: str = None):
'page_size': page_size,
'items': jsonify(sites)
}

def get_main_category_list(self, id):
    """Return every distinct ``main_category`` value stored in this collection.

    :param id: unused; present to match the route handler signature
    :return: response dict with ``status`` and the list of values under ``items``
    """
    main_categories = db_manager.distinct(
        col_name=self.col_name,
        key='main_category',
        filter={},
    )
    return {
        'status': 'ok',
        'items': main_categories,
    }

def get_category_list(self, id):
    """Return distinct ``category`` values, optionally narrowed by ``main_category``.

    Reads the request arguments via the class parser; when ``main_category``
    is supplied, only categories under it are returned.

    :param id: unused; present to match the route handler signature
    :return: response dict with ``status`` and the list of values under ``items``
    """
    args = self.parser.parse_args()
    query = {}
    main_category = args.get('main_category')
    if main_category is not None:
        # Restrict the distinct scan to the requested main category.
        query['main_category'] = main_category
    categories = db_manager.distinct(
        col_name=self.col_name,
        key='category',
        filter=query,
    )
    return {
        'status': 'ok',
        'items': categories,
    }
Loading

0 comments on commit 0a67d6a

Please sign in to comment.