
Commit

Fixing the error of a webpage's 404 Not Found that disrupts search results
yym68686 committed Nov 3, 2023
1 parent f4bc19c commit 0a7a865
Showing 2 changed files with 66 additions and 36 deletions.
3 changes: 3 additions & 0 deletions agent.py
@@ -254,6 +254,9 @@ def Web_crawler(url: str) -> str:
    try:
        requests.packages.urllib3.disable_warnings()
        response = requests.get(url, headers=headers, verify=False, timeout=5, stream=True)
        if response.status_code == 404:
            print("Page not found:", url)
            return ""
        content_length = int(response.headers.get('Content-Length', 0))
        if content_length > 5000000:
            print("Skipping large file:", url)
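The guard this commit adds is easy to isolate. Below is a minimal, self-contained sketch of the same pattern, assuming only the requests package; the function name fetch_body_or_skip is hypothetical, and only the status-code and Content-Length checks mirror the diff:

import requests

def fetch_body_or_skip(url: str) -> str:
    # Sketch of the guard added in agent.py: bail out on a 404 so the error
    # page's boilerplate text never reaches the search-result pipeline.
    try:
        requests.packages.urllib3.disable_warnings()
        response = requests.get(url, verify=False, timeout=5, stream=True)
        if response.status_code == 404:
            print("Page not found:", url)
            return ""  # empty result: the caller treats the page as contentless
        # Mirror of the ~5 MB Content-Length cap in the diff above
        if int(response.headers.get('Content-Length', 0)) > 5000000:
            print("Skipping large file:", url)
            return ""
        return response.text
    except requests.RequestException as e:
        print("error", e)
        return ""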
99 changes: 63 additions & 36 deletions test/test_Web_crawler.py
@@ -7,23 +7,61 @@
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter

def Web_crawler(url: str) -> str:
    """Return the body text of the page at url; must be a valid URL."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    result = ''
    try:
        requests.packages.urllib3.disable_warnings()
        response = requests.get(url, headers=headers, verify=False)
        soup = BeautifulSoup(response.text.encode(response.encoding), 'lxml', from_encoding='utf-8')
        body = "".join(soup.find('body').get_text().split('\n'))
        result = body
    except Exception as e:
        print('\033[31m')
        print("error", e)
        print('\033[0m')
    return result
# def Web_crawler(url: str) -> str:
#     """Return the body text of the page at url; must be a valid URL."""
#     headers = {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
#     }
#     result = ''
#     try:
#         requests.packages.urllib3.disable_warnings()
#         response = requests.get(url, headers=headers, verify=False)
#         soup = BeautifulSoup(response.text.encode(response.encoding), 'lxml', from_encoding='utf-8')
#         body = "".join(soup.find('body').get_text().split('\n'))
#         result = body
#     except Exception as e:
#         print('\033[31m')
#         print("error", e)
#         print('\033[0m')
#     return result

# def Web_crawler(url: str) -> str:
#     """Return the body text of the page at url; must be a valid URL."""
#     headers = {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
#     }
#     result = ''
#     try:
#         requests.packages.urllib3.disable_warnings()
#         response = requests.get(url, headers=headers, verify=False, timeout=5, stream=True)
#         content_length = int(response.headers.get('Content-Length', 0))
#         if content_length > 500000:
#             print("Skipping large file:", url)
#             return result
#         # detected_encoding = chardet.detect(response.content)['encoding']
#         # decoded_content = response.content.decode(detected_encoding, errors='replace')
#         # # soup = BeautifulSoup(response.text, 'html.parser')
#         # soup = BeautifulSoup(decoded_content, 'lxml')
#         # # soup = BeautifulSoup(response.text.encode(response.encoding), 'lxml', from_encoding='utf-8')
#         # body = "".join(soup.find('body').get_text().split('\n'))
#         # result = body

#         detected_encoding = chardet.detect(response.content)['encoding']
#         decoded_content = response.content.decode(detected_encoding, errors='ignore')
#         decoded_content = re.sub(r'[^\u0000-\uFFFF]', ' ', decoded_content)
#         soup = BeautifulSoup(decoded_content, 'lxml')
#         body = soup.find('body').get_text()
#         body = body.replace('\n', ' ')
#         body = re.sub(r'http[s]?://\S+', ' ', body)
#         body = re.sub(r'\s+', ' ', body)
#         result = body

#     except Exception as e:
#         print('\033[31m')
#         print("error url", url)
#         print("error", e)
#         print('\033[0m')
#     return result

def Web_crawler(url: str) -> str:
    """Return the body text of the page at url; must be a valid URL."""
@@ -34,28 +72,16 @@ def Web_crawler(url: str) -> str:
    try:
        requests.packages.urllib3.disable_warnings()
        response = requests.get(url, headers=headers, verify=False, timeout=5, stream=True)
        if response.status_code == 404:
            print("Page not found:", url)
            return ""
        content_length = int(response.headers.get('Content-Length', 0))
        if content_length > 500000:
        if content_length > 5000000:
            print("Skipping large file:", url)
            return result
        # detected_encoding = chardet.detect(response.content)['encoding']
        # decoded_content = response.content.decode(detected_encoding, errors='replace')
        # # soup = BeautifulSoup(response.text, 'html.parser')
        # soup = BeautifulSoup(decoded_content, 'lxml')
        # # soup = BeautifulSoup(response.text.encode(response.encoding), 'lxml', from_encoding='utf-8')
        # body = "".join(soup.find('body').get_text().split('\n'))
        # result = body

        detected_encoding = chardet.detect(response.content)['encoding']
        decoded_content = response.content.decode(detected_encoding, errors='ignore')
        decoded_content = re.sub(r'[^\u0000-\uFFFF]', ' ', decoded_content)
        soup = BeautifulSoup(decoded_content, 'lxml')
        body = soup.find('body').get_text()
        body = body.replace('\n', ' ')
        body = re.sub(r'http[s]?://\S+', ' ', body)
        body = re.sub(r'\s+', ' ', body)
        soup = BeautifulSoup(response.text.encode(response.encoding), 'lxml', from_encoding='utf-8')
        body = "".join(soup.find('body').get_text().split('\n'))
        result = body

    except Exception as e:
        print('\033[31m')
        print("error url", url)
@@ -91,7 +117,8 @@ def Web_crawler(url: str) -> str:
# for url in ['https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/403', 'https://www.hostinger.com/tutorials/what-is-403-forbidden-error-and-how-to-fix-it', 'https://beebom.com/what-is-403-forbidden-error-how-to-fix/']:
# for url in ['https://www.lifewire.com/403-forbidden-error-explained-2617989']:
# for url in ['https://www.usnews.com/news/best-countries/articles/2022-02-24/explainer-why-did-russia-invade-ukraine']:
for url in ['https://zhidao.baidu.com/question/317577832.html']:
for url in ['https://github.com/EAimTY/tuic/issues/107']:
# for url in ['https://zhidao.baidu.com/question/317577832.html']:
# for url in ['https://www.cnn.com/2023/09/06/tech/huawei-mate-60-pro-phone/index.html']:
# for url in ['https://www.reddit.com/r/China_irl/comments/15qojkh/46%E6%9C%88%E5%A4%96%E8%B5%84%E5%AF%B9%E4%B8%AD%E5%9B%BD%E7%9B%B4%E6%8E%A5%E6%8A%95%E8%B5%84%E5%87%8F87/', 'https://www.apple.com.cn/job-creation/Apple_China_CSR_Report_2020.pdf', 'https://hdr.undp.org/system/files/documents/hdr2013chpdf.pdf']:
# for url in ['https://www.airuniversity.af.edu/JIPA/Display/Article/3111127/the-uschina-trade-war-vietnam-emerges-as-the-greatest-winner/']:
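The new 404 branch can also be exercised without hitting a live site by stubbing requests.get. The test below is a hypothetical addition, not part of this commit; the test file above only crawls live URLs:

from unittest import mock

def test_web_crawler_returns_empty_on_404():
    # Fake response whose status code triggers the early return;
    # headers are never read because the function bails out first.
    fake_response = mock.Mock()
    fake_response.status_code = 404
    fake_response.headers = {}
    with mock.patch("requests.get", return_value=fake_response):
        assert Web_crawler("https://example.com/missing") == ""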
