Skip to content

Commit

Permalink
Merge pull request #96 from thekingofcity/master
Browse files Browse the repository at this point in the history
samefollow is now stored in UserRelation
  • Loading branch information
ResolveWang authored Jun 18, 2018
2 parents 2ebee1a + 3b396ea commit 76549ce
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 17 deletions.
6 changes: 5 additions & 1 deletion db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,15 @@ def __repr__(self):
class UserRelation(Base):
__table__ = user_relation

def __init__(self, uid, other_id, type, from_where):
def __init__(self, uid, other_id, type, from_where, crawl_time=True):
    """Build a follow/fan relation row.

    :param uid: id of the user owning the relation
    :param other_id: id of the followed user or fan
    :param type: relation kind (matches the `type` column semantics)
    :param from_where: where this relation was discovered
    :param crawl_time: when True, stamp the row with the DB server's
        current time; when False, leave crawl_time unset (NULL)
    """
    self.user_id = uid
    self.follow_or_fans_id = other_id
    self.type = type
    self.from_where = from_where
    # func.now() defers the timestamp to the database server at INSERT time
    self.crawl_time = func.now() if crawl_time else None

def __repr__(self):
    """Human-readable summary of the relation row."""
    return (f'user_id:{self.user_id},follow_or_fans_id:{self.follow_or_fans_id},'
            f'type:{self.type},from_where:{self.from_where}')
Expand Down
5 changes: 2 additions & 3 deletions db/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
Column("contact_info", String(300), default='', server_default=''),
Column("education_info", String(300), default='', server_default=''),
Column("head_img", String(500), default='', server_default=''),
Column("isFan", INTEGER, default=0, server_default='0'),
)

# seed ids for user crawling
Expand Down Expand Up @@ -120,8 +119,8 @@
Column('follow_or_fans_id', String(20)),
Column('type', INTEGER), # 1 stands for fans, 2 stands for follows
Column('from_where', String(60)),
Column('crawl_time', DateTime(3), default=func.now()) # DATETIME(6) means save 6 digits milliseconds
# time is stored in UTC
Column('crawl_time', DateTime(3)) # DATETIME(6) means save 6 digits milliseconds
# time is stored in UTC
)

# dialogue table
Expand Down
5 changes: 3 additions & 2 deletions page_get/user.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,11 @@ def get_url_from_web(user_id):
elif domain == '100505':
user = get_user_detail(user_id, html)
samefollow_uid = get_samefollow_uid()
if samefollow_uid:
if samefollow_uid.strip() != '':
samefollow_uid = samefollow_uid.split(',')
url = SAMEFOLLOW_URL.format(user_id)
isFanHtml = get_page(url, auth_level=2)
user.isFan = person.get_isFan(isFanHtml, samefollow_uid)
person.get_isFan(isFanHtml, samefollow_uid, user_id)
# enterprise or service
else:
user = get_enterprise_detail(user_id, html)
Expand Down
54 changes: 43 additions & 11 deletions page_parse/user/person.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import re
import json

from bs4 import BeautifulSoup

from ..user import public
from decorators import parse_decorator
from db.models import User
from db.models import (User, UserRelation)
from db.dao import UserRelationOper


@parse_decorator(0)
Expand Down Expand Up @@ -90,21 +92,21 @@ def get_detail(html, uid):
user.description = description.encode('gbk', 'ignore').decode('gbk')
elif '注册时间:' in each_str:
user.register_time = each.find(attrs={'class': 'pt_detail'}).get_text().replace('\t', '').replace(
'\r\n', '')
'\r\n', '').replace(' ', '')

if '标签信息' in basic_str:
basic_info = each_module.find_all(attrs={'class': 'li_1 clearfix'})
for each in basic_info:
if '标签:' in each.get_text():
user.tags = each.find(attrs={'class': 'pt_detail'}).get_text().replace('\t', '').replace(
'\n\n\n', '') .strip().replace('\r\n', ';')
'\n\n\n', '') .strip().replace('\r\n', ';').replace(' ', '')

if '教育信息' in basic_str:
basic_info = each_module.find_all(attrs={'class': 'li_1 clearfix'})
for each in basic_info:
if '大学:' in each.get_text():
user.education_info = each.find(attrs={'class': 'pt_detail'}).get_text().replace('\r\n', ',') \
.replace('\t', '').replace('\n', ';').lstrip(';').rstrip(';')
.replace('\t', '').replace('\n', ';').lstrip(';').rstrip(';').replace(' ', '')

if '工作信息' in basic_str:
basic_info = each_module.find_all(attrs={'class': 'li_1 clearfix'})
Expand All @@ -114,7 +116,7 @@ def get_detail(html, uid):
jobs = each.find_all(attrs={'class': 'pt_detail'})
for job in jobs:
jobs_info.append(job.get_text().replace('\r\n', '').replace('\t', '').replace('\n', ''))
user.work_info = ';'.join(jobs_info)
user.work_info = ';'.join(jobs_info).replace(' ', '')

if '联系信息' in basic_str:
basic_info = each_module.find_all(attrs={'class': 'li_1 clearfix'})
Expand All @@ -127,26 +129,56 @@ def get_detail(html, uid):
contact_info.append('email:' + each.find(attrs={'class': 'pt_detail'}).get_text())
if 'MSN:' in each.get_text():
contact_info.append('msn:' + each.find(attrs={'class': 'pt_detail'}).get_text())
user.contact_info = ';'.join(contact_info)
user.contact_info = ';'.join(contact_info).replace(' ', '')
except Exception as why:
print('解析出错,具体原因为{why}'.format(why=why))

return user


@parse_decorator(None)
def get_isFan(html, uids, current_uid):
    """
    Parse the "samefollow" page and record common follows.

    :param html: samefollow page html
    :param uids: list of uids to test against the common-follow list
    :param current_uid: uid of the user currently being crawled
    :return: 1 if at least one uid in ``uids`` is commonly followed, else 0
    """
    soup = BeautifulSoup(html, "html.parser")
    scripts = soup.find_all('script')
    # FM.view(...) calls embed each page fragment as a JSON payload
    # (dot escaped so the pattern matches the literal "FM.view")
    pattern = re.compile(r'FM\.view\((.*)\)')

    intersection_ids = []  # uids from param ``uids`` that are commonly followed
    for script in scripts:
        # script.string is None for <script> tags with non-text children;
        # re.search would raise TypeError on None
        if not script.string:
            continue
        m = pattern.search(script.string)
        # The follow list lives in the followTab module, e.g.
        # FM.view({"ns":"pl.content.followTab.index","domid":"Pl_Official_HisRelation__59",...
        if m and 'pl.content.followTab.index' in script.string:
            cont = json.loads(m.group(1)).get('html', '')
            inner = BeautifulSoup(cont, 'html.parser')
            follow_box = inner.find(attrs={'class': 'follow_box'})
            if follow_box is None:
                # page layout changed or fragment empty — nothing to record
                break
            follows = follow_box.find_all(attrs={'class': 'follow_item'})
            uid_pattern = re.compile(r'uid=(.*?)&')
            user_ids = []  # uids both the crawler account and the user follow
            for follow in follows:
                uid_match = uid_pattern.search(str(follow))
                # keep only well-formed numeric ids
                if uid_match and uid_match.group(1).isdigit():
                    user_ids.append(uid_match.group(1))
            # which of the requested uids does this user also follow
            intersection_ids = list(set(user_ids).intersection(uids))
            # persist each common follow; relation type 1, no from_where,
            # crawl_time=False since the relation is derived, not crawled
            relations = [UserRelation(uid, current_uid, 1, None, False)
                         for uid in intersection_ids]
            UserRelationOper.add_all(relations)
            break
    # legacy support: callers expect 1 for yes, 0 for no
    return 1 if intersection_ids else 0
1 change: 1 addition & 0 deletions page_parse/user/public.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ def get_fans_or_follows(html, uid, type):
n = n[2:len(n)-2]
user_ids.append(r)
relations.append(UserRelation(uid, r, type, n))
break

UserRelationOper.add_all(relations)
return user_ids
Expand Down

0 comments on commit 76549ce

Please sign in to comment.