-
Notifications
You must be signed in to change notification settings - Fork 5.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f8881db
commit d256337
Showing
34 changed files
with
2,913 additions
and
117 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
# Sample workflow for building a VitePress site and deploying it to GitHub Pages | ||
# | ||
name: Deploy VitePress site to Pages | ||
|
||
on: | ||
# Runs on pushes targeting the `main` branch. If you | ||
# use `master` as the default branch, change this to `master` | ||
push: | ||
branches: [main] | ||
|
||
# Allows you to run this workflow manually from the Actions tab | ||
workflow_dispatch: | ||
|
||
# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages | ||
permissions: | ||
contents: read | ||
pages: write | ||
id-token: write | ||
|
||
# Allow only one concurrent deployment, skipping runs queued between the run in progress and the latest queued one | ||
# However, do NOT cancel in-progress runs, because we want these production deployments to complete | ||
concurrency: | ||
group: pages | ||
cancel-in-progress: false | ||
|
||
jobs: | ||
# Build job | ||
build: | ||
runs-on: ubuntu-latest | ||
steps: | ||
- name: Checkout | ||
uses: actions/checkout@v4 | ||
with: | ||
fetch-depth: 0 # Not needed if lastUpdated is not enabled | ||
# - uses: pnpm/action-setup@v3 # Uncomment this if you are using pnpm | ||
# - uses: oven-sh/setup-bun@v1 # Uncomment this if you are using Bun | ||
- name: Setup Node | ||
uses: actions/setup-node@v4 | ||
with: | ||
node-version: 20 | ||
cache: npm # or pnpm / yarn | ||
- name: Setup Pages | ||
uses: actions/configure-pages@v4 | ||
- name: Install dependencies | ||
run: npm ci # or pnpm install / yarn install / bun install | ||
- name: Build with VitePress | ||
run: npm run docs:build # or pnpm docs:build / yarn docs:build / bun run docs:build | ||
- name: Upload artifact | ||
uses: actions/upload-pages-artifact@v3 | ||
with: | ||
path: docs/.vitepress/dist | ||
|
||
# Deployment job | ||
deploy: | ||
environment: | ||
name: github-pages | ||
url: ${{ steps.deployment.outputs.page_url }} | ||
needs: build | ||
runs-on: ubuntu-latest | ||
name: Deploy | ||
steps: | ||
- name: Deploy to GitHub Pages | ||
id: deployment | ||
uses: actions/deploy-pages@v4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -167,4 +167,6 @@ cython_debug/ | |
/data/ | ||
|
||
*/.DS_Store | ||
.vscode | ||
.vscode | ||
/node_modules | ||
docs/.vitepress/cache |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import {defineConfig} from 'vitepress' | ||
|
||
// https://vitepress.dev/reference/site-config | ||
// VitePress site configuration: site title and description, last-updated
// timestamps, and the default theme's edit link, nav bar, sidebar and social links.
// NOTE(review): no `base` option is set here. VitePress documents `base` as
// required when the site is served from a sub-path (e.g. a GitHub Pages project
// site) — confirm the final deployment URL.
export default defineConfig({ | ||
title: "MediaCrawler自媒体爬虫", | ||
description: "小红书爬虫,抖音爬虫, 快手爬虫, B站爬虫, 微博爬虫,百度贴吧爬虫,知乎爬虫...。 ", | ||
lastUpdated: true, | ||
themeConfig: { | ||
// NOTE(review): this pattern uses GitHub's `tree` route; the editLink example
// in the VitePress docs uses the `edit` route — confirm which one is intended.
editLink: { | ||
pattern: 'https://github.com/NanmiCoder/MediaCrawler/tree/main/docs/:path' | ||
}, | ||
// https://vitepress.dev/reference/default-theme-config | ||
// Top navigation entries: home, author/contact page, paid-content page.
nav: [ | ||
{text: '首页', link: '/'}, | ||
{text: '联系我', link: '/作者介绍'}, | ||
{text: '支持我', link: '/知识付费介绍'}, | ||
], | ||
|
||
// Sidebar: a mix of single-link entries and grouped sections (groups list their pages in `items`).
sidebar: [ | ||
{ | ||
text: '作者介绍', | ||
link: '/作者介绍', | ||
}, | ||
{ | ||
text: 'MediaCrawler使用文档', | ||
items: [ | ||
{text: '基本使用', link: '/'}, | ||
{text: '常见问题汇总', link: '/常见问题'}, | ||
{text: 'IP代理使用', link: '/代理使用'}, | ||
{text: '词云图使用', link: '/词云图使用配置'}, | ||
{text: '项目目录结构', link: '/项目代码结构'}, | ||
{text: '手机号登录说明', link: '/手机号登录说明'}, | ||
] | ||
}, | ||
{ | ||
text: '知识付费', | ||
items: [ | ||
{text: '知识付费介绍', link: '/知识付费介绍'}, | ||
{text: 'MediaCrawlerPro订阅', link: '/mediacrawlerpro订阅'}, | ||
// This entry uses an absolute external URL rather than a site-relative path.
{ | ||
text: 'MediaCrawler源码剖析课', | ||
link: 'https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh' | ||
}, | ||
{text: '知识星球文章专栏', link: '/知识星球介绍'}, | ||
{text: '开发者咨询服务', link: '/开发者咨询'}, | ||
] | ||
}, | ||
// This entry links to the same path ('/作者介绍') as the first sidebar entry.
{ | ||
text: 'MediaCrawler项目交流群', | ||
link: '/作者介绍', | ||
}, | ||
{ | ||
text: '爬虫入门教程分享', | ||
items: [ | ||
{text: "我写的爬虫入门教程", link: 'https://github.com/NanmiCoder/CrawlerTutorial'} | ||
] | ||
}, | ||
|
||
], | ||
|
||
// Social links for the theme; only the project's GitHub repository is listed.
socialLinks: [ | ||
{icon: 'github', link: 'https://github.com/NanmiCoder/MediaCrawler'} | ||
] | ||
} | ||
}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
# MediaCrawler使用方法 | ||
## 免责声明 | ||
> **免责声明:** | ||
> | ||
> 大家请以学习为目的使用本仓库,爬虫违法违规的案件:https://github.com/HiddenStrawberry/Crawler_Illegal_Cases_In_China <br> | ||
> | ||
>本项目的所有内容仅供学习和参考之用,禁止用于商业用途。任何人或组织不得将本仓库的内容用于非法用途或侵犯他人合法权益。本仓库所涉及的爬虫技术仅用于学习和研究,不得用于对其他平台进行大规模爬虫或其他非法行为。对于因使用本仓库内容而引起的任何法律责任,本仓库不承担任何责任。使用本仓库的内容即表示您同意本免责声明的所有条款和条件。 | ||
|
||
## 创建并激活 python 虚拟环境 | ||
```shell | ||
# 进入项目根目录 | ||
cd MediaCrawler | ||
|
||
# 创建虚拟环境 | ||
# 我的python版本是:3.9.6,requirements.txt中的库是基于这个版本的,如果是其他python版本,可能requirements.txt中的库不兼容,自行解决一下。 | ||
python -m venv venv | ||
|
||
# macos & linux 激活虚拟环境 | ||
source venv/bin/activate | ||
|
||
# windows 激活虚拟环境 | ||
venv\Scripts\activate | ||
|
||
``` | ||
|
||
## 安装依赖库 | ||
|
||
```shell | ||
pip install -r requirements.txt | ||
``` | ||
|
||
## 安装 playwright浏览器驱动 | ||
|
||
```shell | ||
playwright install | ||
``` | ||
|
||
## 运行爬虫程序 | ||
|
||
```shell | ||
### 项目默认没有开启评论爬取模式,如需爬取评论,请修改 config/base_config.py 中的 ENABLE_GET_COMMENTS 变量 | ||
### 一些其他支持项,也可以在config/base_config.py查看功能,写的有中文注释 | ||
# 从配置文件中读取关键词搜索相关的帖子并爬取帖子信息与评论 | ||
python main.py --platform xhs --lt qrcode --type search | ||
# 从配置文件中读取指定的帖子ID列表获取指定帖子的信息与评论信息 | ||
python main.py --platform xhs --lt qrcode --type detail | ||
# 打开对应APP扫二维码登录 | ||
# 其他平台爬虫使用示例,执行下面的命令查看 | ||
python main.py --help | ||
``` | ||
|
||
## 数据保存 | ||
- 支持保存到关系型数据库 MySQL 中(需要提前创建数据库) | ||
- 执行 `python db.py` 初始化数据库表结构(只在首次执行) | ||
- 支持保存到csv中(data/目录下) | ||
- 支持保存到json中(data/目录下) |
Oops, something went wrong.