Skip to content

Commit

Permalink
Merge pull request #96 from thekingofcity/master
Browse files Browse the repository at this point in the history
samefollow is now stored in UserRelation
  • Loading branch information
ResolveWang authored Jun 18, 2018
2 parents 2ebee1a + 3b396ea commit 76549ce
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 17 deletions.
6 changes: 5 additions & 1 deletion db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,15 @@ def __repr__(self):
class UserRelation(Base):
__table__ = user_relation

def __init__(self, uid, other_id, type, from_where):
def __init__(self, uid, other_id, type, from_where, crawl_time=True):
    """Build a follow/fan relation row.

    :param uid: id of the user owning the relation
    :param other_id: id of the followed user or fan
    :param type: relation kind (matches the `type` column semantics)
    :param from_where: where this relation was discovered
    :param crawl_time: when True, stamp the row with the DB server's
        current time; when False, leave crawl_time unset (NULL)
    """
    self.user_id = uid
    self.follow_or_fans_id = other_id
    self.type = type
    self.from_where = from_where
    # func.now() defers the timestamp to the database server at INSERT time
    self.crawl_time = func.now() if crawl_time else None

def __repr__(self):
    """Human-readable summary of the relation row."""
    return (f'user_id:{self.user_id},follow_or_fans_id:{self.follow_or_fans_id},'
            f'type:{self.type},from_where:{self.from_where}')
Expand Down
5 changes: 2 additions & 3 deletions db/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
Column("contact_info", String(300), default='', server_default=''),
Column("education_info", String(300), default='', server_default=''),
Column("head_img", String(500), default='', server_default=''),
Column("isFan", INTEGER, default=0, server_default='0'),
)

# seed ids for user crawling
Expand Down Expand Up @@ -120,8 +119,8 @@
Column('follow_or_fans_id', String(20)),
Column('type', INTEGER), # 1 stands for fans, 2 stands for follows
Column('from_where', String(60)),
Column('crawl_time', DateTime(3), default=func.now()) # DATETIME(6) means save 6 digits milliseconds
# time is stored in UTC
Column('crawl_time', DateTime(3)) # DATETIME(6) means save 6 digits milliseconds
# time is stored in UTC
)

# dialogue table
Expand Down
5 changes: 3 additions & 2 deletions page_get/user.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,11 @@ def get_url_from_web(user_id):
elif domain == '100505':
user = get_user_detail(user_id, html)
samefollow_uid = get_samefollow_uid()
if samefollow_uid:
if samefollow_uid.strip() != '':
samefollow_uid = samefollow_uid.split(',')
url = SAMEFOLLOW_URL.format(user_id)
isFanHtml = get_page(url, auth_level=2)
user.isFan = person.get_isFan(isFanHtml, samefollow_uid)
person.get_isFan(isFanHtml, samefollow_uid, user_id)
# enterprise or service
else:
user = get_enterprise_detail(user_id, html)
Expand Down
54 changes: 43 additions & 11 deletions page_parse/user/person.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import re
import json

from bs4 import BeautifulSoup

from ..user import public
from decorators import parse_decorator
from db.models import User
from db.models import (User, UserRelation)
from db.dao import UserRelationOper


@parse_decorator(0)
Expand Down Expand Up @@ -90,21 +92,21 @@ def get_detail(html, uid):
user.description = description.encode('gbk', 'ignore').decode('gbk')
elif '注册时间:' in each_str:
user.register_time = each.find(attrs={'class': 'pt_detail'}).get_text().replace('\t', '').replace(
'\r\n', '')
'\r\n', '').replace(' ', '')

if '标签信息' in basic_str:
basic_info = each_module.find_all(attrs={'class': 'li_1 clearfix'})
for each in basic_info:
if '标签:' in each.get_text():
user.tags = each.find(attrs={'class': 'pt_detail'}).get_text().replace('\t', '').replace(
'\n\n\n', '') .strip().replace('\r\n', ';')
'\n\n\n', '') .strip().replace('\r\n', ';').replace(' ', '')

if '教育信息' in basic_str:
basic_info = each_module.find_all(attrs={'class': 'li_1 clearfix'})
for each in basic_info:
if '大学:' in each.get_text():
user.education_info = each.find(attrs={'class': 'pt_detail'}).get_text().replace('\r\n', ',') \
.replace('\t', '').replace('\n', ';').lstrip(';').rstrip(';')
.replace('\t', '').replace('\n', ';').lstrip(';').rstrip(';').replace(' ', '')

if '工作信息' in basic_str:
basic_info = each_module.find_all(attrs={'class': 'li_1 clearfix'})
Expand All @@ -114,7 +116,7 @@ def get_detail(html, uid):
jobs = each.find_all(attrs={'class': 'pt_detail'})
for job in jobs:
jobs_info.append(job.get_text().replace('\r\n', '').replace('\t', '').replace('\n', ''))
user.work_info = ';'.join(jobs_info)
user.work_info = ';'.join(jobs_info).replace(' ', '')

if '联系信息' in basic_str:
basic_info = each_module.find_all(attrs={'class': 'li_1 clearfix'})
Expand All @@ -127,26 +129,56 @@ def get_detail(html, uid):
contact_info.append('email:' + each.find(attrs={'class': 'pt_detail'}).get_text())
if 'MSN:' in each.get_text():
contact_info.append('msn:' + each.find(attrs={'class': 'pt_detail'}).get_text())
user.contact_info = ';'.join(contact_info)
user.contact_info = ';'.join(contact_info).replace(' ', '')
except Exception as why:
print('解析出错,具体原因为{why}'.format(why=why))

return user


@parse_decorator(None)
def get_isFan(html, uids, current_uid):
    """
    Parse the "samefollow" page and record common follows.

    :param html: samefollow page html
    :param uids: list of uids to test against the common-follow list
    :param current_uid: uid of the user currently being crawled
    :return: 1 if at least one uid in ``uids`` is commonly followed, else 0
    """
    soup = BeautifulSoup(html, "html.parser")
    scripts = soup.find_all('script')
    # FM.view(...) calls embed each page fragment as a JSON payload
    # (dot escaped so the pattern matches the literal "FM.view")
    pattern = re.compile(r'FM\.view\((.*)\)')

    intersection_ids = []  # uids from param ``uids`` that are commonly followed
    for script in scripts:
        # script.string is None for <script> tags with non-text children;
        # re.search would raise TypeError on None
        if not script.string:
            continue
        m = pattern.search(script.string)
        # The follow list lives in the followTab module, e.g.
        # FM.view({"ns":"pl.content.followTab.index","domid":"Pl_Official_HisRelation__59",...
        if m and 'pl.content.followTab.index' in script.string:
            cont = json.loads(m.group(1)).get('html', '')
            inner = BeautifulSoup(cont, 'html.parser')
            follow_box = inner.find(attrs={'class': 'follow_box'})
            if follow_box is None:
                # page layout changed or fragment empty — nothing to record
                break
            follows = follow_box.find_all(attrs={'class': 'follow_item'})
            uid_pattern = re.compile(r'uid=(.*?)&')
            user_ids = []  # uids both the crawler account and the user follow
            for follow in follows:
                uid_match = uid_pattern.search(str(follow))
                # keep only well-formed numeric ids
                if uid_match and uid_match.group(1).isdigit():
                    user_ids.append(uid_match.group(1))
            # which of the requested uids does this user also follow
            intersection_ids = list(set(user_ids).intersection(uids))
            # persist each common follow; relation type 1, no from_where,
            # crawl_time=False since the relation is derived, not crawled
            relations = [UserRelation(uid, current_uid, 1, None, False)
                         for uid in intersection_ids]
            UserRelationOper.add_all(relations)
            break
    # legacy support: callers expect 1 for yes, 0 for no
    return 1 if intersection_ids else 0
1 change: 1 addition & 0 deletions page_parse/user/public.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ def get_fans_or_follows(html, uid, type):
n = n[2:len(n)-2]
user_ids.append(r)
relations.append(UserRelation(uid, r, type, n))
break

UserRelationOper.add_all(relations)
return user_ids
Expand Down

0 comments on commit 76549ce

Please sign in to comment.