Skip to content

Commit

Permalink
Merge pull request #87 from goodbest/master
Browse files Browse the repository at this point in the history
抓评论时加入表情功能
  • Loading branch information
ResolveWang authored Apr 20, 2018
2 parents 61de71e + 1a8c0cb commit 2ebee1a
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 4 deletions.
2 changes: 1 addition & 1 deletion db/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
def get_engine():
    """Build the database engine from the configured connection settings.

    Connection settings are read from ``get_db_args()``. The ``DB_PASS``
    environment variable, when set, takes precedence over the configured
    password.

    :return: the engine object produced by ``create_engine``.
    """
    args = get_db_args()
    password = os.getenv('DB_PASS', args['password'])
    # charset is utf8mb4, not utf8: assuming a MySQL backend (the pymysql
    # driver is used), the legacy "utf8" charset holds at most three bytes
    # per character and therefore cannot store emoji.
    connect_str = "{}+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4".format(
        args['db_type'], args['user'], password,
        args['host'], args['port'], args['db_name'])
    engine = create_engine(connect_str, encoding='utf-8')
    return engine
Expand Down
1 change: 1 addition & 0 deletions db/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
Column("id", INTEGER, primary_key=True, autoincrement=True),
Column("comment_id", String(50), unique=True),
Column("comment_cont", Text),
Column("comment_screen_name", Text),
Column("weibo_id", String(200)),
Column("user_id", String(20)),
Column("create_time", String(200)),
Expand Down
44 changes: 41 additions & 3 deletions page_parse/comment.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from logger import parser
from db.models import WeiboComment
from decorators import parse_decorator

from utils import parse_emoji

@parse_decorator('')
def get_html_cont(html):
Expand Down Expand Up @@ -63,14 +63,52 @@ def get_comment_list(html, wb_id):
if not cont:
return list()

soup = BeautifulSoup(cont, 'html.parser')
soup = BeautifulSoup(cont, 'html5lib')
comment_list = list()
comments = soup.find(attrs={'node-type': 'comment_list'}).find_all(attrs={'class': 'list_li S_line1 clearfix'})

for comment in comments:
wb_comment = WeiboComment()
try:
wb_comment.comment_cont = comment.find(attrs={'class': 'WB_text'}).text.strip()
cont = []
first_author=True
first_colon=True
for content in comment.find(attrs={'class': 'WB_text'}).contents:
if not content:
continue
if content.name =='a':
if first_author:
first_author=False
continue
else:
if content.text:
cont.append(content.text)

elif content.name=='img':
img_title = content.get('title', '')
if img_title=='':
img_title = content.get('alt', '')
if img_title=='':
img_src = content.get('src','')
img_src = img_src.split('/')[-1].split('.',1)[0]
try:
img_title = parse_emoji.softband_to_utf8(img_src)
except Exception as e:
parser.error('解析表情失败,具体信息是{},{}'.format(e, comment))
img_title = ''
cont.append(img_title)

else:
if first_colon:
if content.find(':')==0:
cont.append(content.replace(':','',1))
first_colon=False
else:
cont.append(content)

wb_comment.comment_cont = ''.join(cont)
wb_comment.comment_screen_name =comment.find(attrs={'class': 'WB_text'}).find('a').text

wb_comment.comment_id = comment['comment_id']
# TODO 将wb_comment.user_id加入待爬队列(seed_ids)
wb_comment.user_id = comment.find(attrs={'class': 'WB_text'}).find('a').get('usercard')[3:]
Expand Down
1 change: 1 addition & 0 deletions utils/emoji_ios6.json

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions utils/parse_emoji.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import json



def load_emoji_map(fn='utils/emoji_ios6.json'):
    """Load the emoji lookup table from a JSON file.

    The file must contain a list of objects; only the ``sb`` and ``utf8``
    keys of each object are read.

    :param fn: path of the JSON file. The default is a relative path, so it
        is resolved against the current working directory.
    :return: dict mapping each lower-cased ``sb`` code to its ``utf8`` value.
        If a code occurs more than once, the last entry wins.
    """
    # Use a context manager so the file handle is closed even when the JSON
    # cannot be parsed.
    with open(fn, encoding='utf-8') as emoji_file:
        json_data = json.load(emoji_file)
    # Keys are lower-cased so lookups can be case-insensitive.
    return {m['sb'].lower(): m['utf8'] for m in json_data}


def softband_to_utf8(emoji):
    """Translate an emoji code into the text it stands for.

    :param emoji: code to look up in the module-level ``sb_dict``. It is
        lower-cased before the lookup, so matching is case-insensitive.
    :return: the decoded string, or ``''`` when the code is unknown or maps
        to an empty value.
    """
    utf8_hex = sb_dict.get(emoji.lower(), '')
    if not utf8_hex:
        return ''
    # The stored value is treated as the hex digits of a UTF-8 byte sequence.
    return bytes.fromhex(utf8_hex).decode('utf-8')

# Build the lookup table once, when the module is imported, from the default
# JSON path; softband_to_utf8 reads this module-level dict on every call.
sb_dict = load_emoji_map()

0 comments on commit 2ebee1a

Please sign in to comment.