-
Notifications
You must be signed in to change notification settings - Fork 3
/
ocr.py
65 lines (52 loc) · 1.3 KB
/
ocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
"""
Program for extracting the text from a meme image.
"""
import vendor
vendor.add("lib")
import sys
import requests
import pytesseract
from PIL import Image
from StringIO import StringIO
"""
Convert the image to black and white to
make the white text stand out more.
This usually helps with extracting the text
since the text is usually white.
"""
def darken_background(img, limit=255):
    """Reduce an image to pure black and white so bright text stands out.

    The image is converted to grayscale ('L' mode) and then thresholded:
    every pixel whose gray level is below ``limit`` becomes black (0) and
    every other pixel becomes white (255).

    Args:
        img: Image object providing ``convert`` and ``point``.
        limit: Gray level at or above which a pixel is kept white. With the
            default of 255 only fully white pixels survive.

    Returns:
        The thresholded image in 1-bit ('1') mode.
    """
    def threshold(level):
        # Anything darker than the limit is treated as background.
        if level < limit:
            return 0
        return 255

    grayscale = img.convert('L')
    return grayscale.point(threshold, '1')
"""
Get the image from a URL or a local file path
"""
def get_image(url, darken=True, timeout=30):
    """Load an image from an HTTP(S) URL or a local file path.

    Args:
        url: Either an ``http://`` / ``https://`` URL to download, or a
            path that ``Image.open`` can read from disk.
        darken: When true (the default), run the image through
            ``darken_background`` before returning it.
        timeout: Seconds to wait on the remote server before giving up.
            Only used when ``url`` is remote.

    Returns:
        The loaded image, or ``None`` if it could not be fetched or opened.
    """
    # Match on the scheme prefix rather than the substring "http", so that
    # a local path such as "http_memes/a.png" is still opened from disk.
    is_remote = url.lower().startswith(("http://", "https://"))

    try:
        if is_remote:
            # Without a timeout requests may block forever on a stalled host.
            response = requests.get(url, timeout=timeout)
            img = Image.open(StringIO(response.content))
        else:
            img = Image.open(url)
    except IOError:
        # Raised by PIL for unreadable images; requests' own exceptions
        # derive from IOError too, so network failures also land here.
        return None

    if darken:
        img = darken_background(img)

    # Prevent "IOError: cannot write mode RGBA as BMP" by reducing any
    # four-band image to RGB. convert() drops the alpha band of RGBA images
    # and performs a real colour conversion for other four-band modes
    # instead of relabelling their first three bands as R, G and B.
    if len(img.getbands()) == 4:
        img = img.convert("RGB")
    return img
"""
Extract the text from the image object
"""
def get_text_from_img(img):
    """Return the text that ``pytesseract.image_to_string`` reads from ``img``."""
    recognised_text = pytesseract.image_to_string(img)
    return recognised_text
if __name__ == '__main__':
    # Command-line entry point: the single argument is an image URL or path.
    if len(sys.argv) < 2:
        # Tell the user what is expected instead of silently doing nothing.
        sys.stderr.write("Usage: %s <image-url-or-path>\n" % sys.argv[0])
        sys.exit(1)

    image = get_image(sys.argv[1])
    if image is None:
        print("URL is invalid")
    else:
        # Keep a copy of the preprocessed image that was handed to the OCR step.
        image.save("result.png")
        print(get_text_from_img(image))