#!/usr/bin/env python3
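"""Download comics and novels from https://tapas.io.

Comic episodes are saved as numbered image files in a per-series folder;
novels are assembled into a single EPUB via ebooklib.
"""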
from pyquery import PyQuery as pq
from pathlib import Path
import os
import argparse
import re
import requests
import http.cookiejar
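
# Zero-pad num to the digit width of largest, so downloaded files sort correctly by name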
def lead0(num, largest):
    return str(num).zfill(len(str(largest)))
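
# Best-effort terminal size via ioctl; falls back to 200x80 when there is no
# tty or the POSIX-only fcntl/termios modules are missing (e.g. on Windows)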
def terminal_size():
try:
import fcntl
import termios
import struct
th, tw, hp, wp = struct.unpack('HHHH', fcntl.ioctl(0, termios.TIOCGWINSZ, struct.pack('HHHH', 0, 0, 0, 0)))
except (IOError, ModuleNotFoundError):
th, tw = 80, 200
return tw, th
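
# Print a message padded to the full terminal width. With noNewLine=True the
# line ends in '\r' so the next status message overwrites it in place (in
# verbose mode every message is kept on its own line instead)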
def printLine(msg='', noNewLine=False):
terminalWidth = terminal_size()[0]
spaces = terminalWidth - len(msg)
if noNewLine:
if args.verbose:
print(' ' + msg + (' ' * (spaces - 1)))
else:
print(msg + (' ' * spaces), end='\r')
else:
print(msg + (' ' * spaces))
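
# Drop characters that are invalid in file names: '/' always, plus the
# Windows/FAT-reserved set when fat=True (-r/--restrict-characters)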
def check_path(path, slash=True, fat=False):
evil_chars = []
if slash:
evil_chars.append('/')
if fat:
evil_chars += ['?', '<', '>', '\\', ':', '*', '|', '"', '^']
return ''.join([char for char in path if char not in evil_chars])
# parse input and set up help
parser = argparse.ArgumentParser(description='Downloads comics from \'https://tapas.io\'.\nIf the folder of a previously downloaded comic is found, only new pages are downloaded (can be disabled with -f/--force).', formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('url', metavar='URL/name', type=str, nargs='+',
                    help='URL or URL name of the comic\nGo to the comic you want to download (any page)\nRight-click the comic name in the upper left corner and select "Copy link address" (or similar), or just use the name after /series/ in the URL\nExamples: https://tapas.io/series/Erma, RavenWolf, ...')
parser.add_argument('-f', '--force', action="store_true", help='Disables updater.')
parser.add_argument('-v', '--verbose', action="store_true", help='Enables verbose mode.')
parser.add_argument('-r', '--restrict-characters', action="store_true", help='Removes \'? < > \\ : * | " ^\' from file names')
parser.add_argument('-c', '--cookies', type=str, nargs='?', default="", dest='cookies', metavar='PATH',
help='Optional cookies.txt file to load, can be used to allow the script to "log in" and circumvent age verification.')
parser.add_argument('-o', '--output-dir', type=str, nargs='?', default="", dest='baseDir', metavar='PATH',
help='Output directory where comics should be placed.\nIf left blank, the script folder will be used.')
args = parser.parse_args()
s = requests.Session()
s.headers.update({'user-agent': 'tapas-dl'})
if args.cookies:
s.cookies = http.cookiejar.MozillaCookieJar()
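    # ignore_discard/ignore_expires also load session cookies and already-expired cookies instead of dropping them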
s.cookies.load(args.cookies, ignore_discard=True, ignore_expires=True)
# FIXME Those cookies should be set, but no idea how...
#s.cookies.update({'birthDate': '1901-01-01'})
#s.cookies.update({'adjustedBirthDate': '1901-01-01'})
basePath = ""
if args.baseDir:
    basePath = Path(args.baseDir)
for urlCount, url in enumerate(args.url):
# check url/name
if re.match(r'^https://tapas\.io/series/.+$', url):
urlName = url[url.rindex('/') + 1:]
else:
urlName = url
printLine('Loading ' + urlName + '...', True)
    # Get the comic's start page and test whether the comic exists
    pageRequest = s.get('https://tapas.io/series/' + urlName)
    if pageRequest.status_code != 200:
        printLine('Error: Comic "{}" not found\n'.format(urlName))
        continue
    page = pq(pageRequest.text)
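    # Scrape series metadata (title, author, internal series id, header image) from the landing page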
name = page('.center-info__title.center-info__title--small').text()
author = page('div.viewer-section.viewer-section--episode a.name').text()
seriesId = page('.subscribe-btn').attr('data-id')
if len(page('.row-item--info > img')) > 0:
headerSrc = page('.row-item--info > img').attr('src')
else:
headerSrc = None
data = []
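    # Fetch the complete episode list from the JSON API in a single request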
page = pq(s.get(f'https://tapas.io/series/{seriesId}/episodes?page=1&sort=OLDEST&max_limit=99999999') # It's over 9000! But I love that they forgot to limit the max_limit, because that means I don't have to bother with pagination ^^
.json()['data']['body'])
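    # Each episode's numeric id is the tail of its data-permalink attribute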
for episode in page('[data-permalink*="/episode/"]'):
data.append({'id': int(episode.attrib['data-permalink'][episode.attrib['data-permalink'].rfind('/') + 1:])})
printLine('{} [{}] ({} pages):'.format(name, urlName, len(data)))
    # Check if the folder exists, if not create it
printLine('Checking folder...', True)
    # If the user specified a base output directory, prepend it to our folder
    savePath = check_path('{} [{}]'.format(name, urlName), fat=args.restrict_characters)
    if basePath:
        savePath = os.path.join(basePath, savePath)
printLine('Full path is: ' + str(savePath))
if os.path.isdir(savePath) and not args.force:
printLine('Found directory, only updating (use -f/--force to disable)')
        fileNames = sorted(f.name for f in os.scandir(savePath))
        # Exclude the '-1 - header.*' file from the count, otherwise the image numbering is off by one
        imgOffset = len([n for n in fileNames if not n.startswith('-1 - header.')])
        if imgOffset > 0:
            lastFile = fileNames[-1]
            lastPageId = int(lastFile[lastFile.rindex('#') + 1:lastFile.rindex('.')])
            pageOffset = next(i for i, p in enumerate(data) if p['id'] == lastPageId) + 1
            data = data[pageOffset:]
        else:
            pageOffset = 0
    else:
        if not os.path.isdir(savePath):
            printLine('Creating folder...', True)
            os.makedirs(savePath)  # makedirs also creates missing parents of a custom output dir
        pageOffset = 0
        imgOffset = 0
# Download header
    if not any(file.name.startswith('-1 - header.') for file in os.scandir(savePath)):
printLine('Downloading header...', True)
if headerSrc is not None:
with open(os.path.join(savePath, '-1 - header.{}'.format(headerSrc[headerSrc.rindex('.') + 1:])), 'wb') as f:
f.write(s.get(headerSrc).content)
printLine('Downloaded header')
else:
printLine('Header not found')
    if not data:
        print('Nothing to do: No pages found or all pages already downloaded\n')
continue
# Check if series is comic or novel
if len(pq(s.get(f'https://tapas.io/episode/{data[0]["id"]}').content)('.content__img.js-lazy')) > 0:
printLine('Detected comic')
# Get images from page from JS api
allImgCount = 0
for pageCount, pageData in enumerate(data):
# Test whether the page we have in mind is reachable
            pageRequest = s.get(f'https://tapas.io/episode/{pageData["id"]}')
            if pageRequest.status_code != 200:
                # This page was unavailable. Let the user know and add a single dummy image entry.
                # (We will check for this when we go to download images.)
                printLine('Error: "{}" page {}/{} not found. Page request yielded: {} (Early Access page?)'.format(urlName, pageCount + pageOffset, len(data) + pageOffset, str(pageRequest.status_code)), True)
                pageData['title'] = "PageUnavailable"
                pageData['imgs'] = ["PageUnavailable"]
            else:
                # If the page did not yield an access error, go ahead and scrape it for image entries.
                pageHtml = pq(pageRequest.content)  # reuse the response instead of fetching the page a second time
                printLine('Collected image data for {} images (page {}/{})...'.format(allImgCount, pageCount + pageOffset, len(data) + pageOffset), True)
pageData['title'] = pageHtml('.info__title').text()
pageData['imgs'] = []
for img in pageHtml('.content__img'):
pageData['imgs'].append(pq(img).attr('data-src'))
allImgCount += 1
# Download images
imgCount = 0
for pageCount, pageData in enumerate(data):
for imgOfPageCount, img in enumerate(pageData['imgs']):
                # Check if the first image entry is the dummy text that indicates the page was unavailable when we tried to scrape it.
                if pageData['imgs'][0] != "PageUnavailable":
                    # If the entry isn't a dummy entry, go ahead and download the images it contains.
                    with open(os.path.join(savePath, check_path('{} - {} - {} - {} - #{}.{}'.format(lead0(imgCount + imgOffset, allImgCount + imgOffset), lead0(pageCount + pageOffset, len(data) + pageOffset),
                                                                lead0(imgOfPageCount, len(pageData['imgs'])), pageData['title'], pageData['id'], img[img.rindex('.') + 1:]),
                                                                fat=args.restrict_characters)), 'wb') as f:
                        f.write(s.get(img).content)
                    imgCount += 1
                    printLine('Downloaded image {}/{} from page {}/{} ({}/{} images)...'.format(imgOfPageCount + 1, len(pageData['imgs']), pageCount + pageOffset, len(data) + pageOffset, imgCount + imgOffset, allImgCount + imgOffset), True)
else:
# If the entry was a dummy entry, skip it and let the user know.
printLine('Error: No images downloaded from page {}/{}.'.format(pageCount + pageOffset, len(data) + pageOffset), True)
        if data:
printLine('Downloaded {} of {} images'.format(imgCount, allImgCount))
else:
printLine('Nothing to do')
if urlCount + 1 != len(args.url):
printLine()
else:
printLine('Detected novel')
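        # ebooklib is only needed for the novel path, so it is imported lazily here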
from ebooklib import epub
# download/create epub
book = epub.EpubBook()
# Add meta data
book.set_identifier(urlName)
book.set_title(name)
book.set_language('en')
book.add_author(author)
        headerFiles = [f for f in os.listdir(savePath) if re.match(r'.+header\..+', f)]
        if headerFiles:  # a series without a header image simply gets no cover
            with open(os.path.join(savePath, headerFiles[0]), 'rb') as f:
                book.set_cover("cover.jpg", f.read())
book.toc = []
book.spine = ['cover']
# create about page
chapter = epub.EpubHtml(title='about', file_name='about.xhtml')
        chapter.content = f'<h1>About</h1><p>Title: {name}</p><p>Author: {author}</p><p>Source: <a href="https://tapas.io/series/{urlName}">https://tapas.io/series/{urlName}</a></p>'
book.add_item(chapter)
book.spine.append(chapter)
# Append nav page
book.spine.append('nav')
# create chapters
for pageCount, pageData in enumerate(data):
printLine('Downloaded page {}/{}...'.format(pageCount + 1, len(data)), True)
pagePq = pq(s.get(f'https://tapas.io/episode/{pageData["id"]}').content)
pageTitle = pagePq('.viewer__header > .title').text()
pageHtml = f'<h1>{pageTitle}</h1>'
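            # Keep only the plain text of each paragraph; inline markup and styling are dropped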
for p in pagePq('article.viewer__body > div.ep-epub-content p'):
p = pq(p)
if p.text() is not None:
pageHtml += '<p>' + p.text() + '</p>'
chapter = epub.EpubHtml(title=pageTitle, file_name=str(pageData['id']) + '.xhtml')
chapter.content = pageHtml
book.add_item(chapter)
# define Table Of Contents
book.toc.append(epub.Link(str(pageData['id']) + '.xhtml', pageTitle, str(pageData['id'])))
# basic spine
book.spine.append(chapter)
# add default NCX and Nav file
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())
# add CSS
style = ''
nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
book.add_item(nav_css)
# write to the file
epub.write_epub(os.path.join(os.path.dirname(savePath), check_path(f'{name}.epub', fat=args.restrict_characters)), book)
# remove tmp folder
for file in os.listdir(savePath):
os.remove(os.path.join(savePath, file))
        os.rmdir(savePath)  # rmdir (unlike removedirs) never prunes empty parent directories such as the output dir