-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhack.py
110 lines (81 loc) · 3.82 KB
/
hack.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/bin/python3
# TODO:
# - Download book. Problem is, the book is defined in HTML and CSS and uses files which are stored on a Scribd server.
# Options:
# - Try to download all the files, images and fonts. Then use the Javascript from Scribd to view it.
# - Try to get the "made" pages (ask from DOM?) as image or PDF, or actually find a way to print it (protected by site).
#
# Tried:
# - Printing the page. This is prevented by the site.
# - Only printing the class="document_container" element.
# - Copying the HTML, CSS and images and view it in a browser.
#
# - Fix PhantomJS
# - Remove more unneeded elements from the page.
# - Start in fullscreen.
import sys
from selenium import webdriver
# Class strings of the page elements that get deleted from the loaded
# document. Each entry is handed unchanged to removeElementByClass().
elementsToRemove = [
    "page_missing_explanation outer_page only_ie6_border between_page_module",
    "autogen_class_views_pdfs_page_blur_promo autogen_class_widgets_base",
    "between_page_ads",
    "buy_doc_bar outer_page only_ie6_border between_page_module",
    # The correct pages are all loaded from the Scribd server, so every page
    # already present (the free preview) is removed.
    "newpage",
    "share_row",
    # "ratings_row" is intentionally not listed: removing it started messing
    # up resizing and fullscreen.
    "autogen_class_views_pdfs_upvote autogen_class_widgets_base",
]
# [search, replacement] pairs; every occurrence of the first string in the
# downloaded page source is replaced by the second, in the order listed.
unblur = [
    # Switch off the blur flag in the page parameters.
    ["pageParams.blur = true", "pageParams.blur = false"],
    # Drop the "blurred_page" class from the page containers.
    ["outer_page only_ie6_border blurred_page", "outer_page only_ie6_border"],
    # Strip the attribute that marks content as unselectable.
    ["unselectable=\"on\"", ""],
]
def parseArgs(argv):
    """Validate the command-line arguments and exit when they are unusable.

    All messages are written to stderr, because stdout is reserved for the
    resulting HTML (the usage text tells the user to redirect it to a file).

    Args:
        argv: Command-line arguments without the program name. Exactly one
            entry is expected: a URL, "-h" or "--help".

    Returns:
        None when argv holds a single argument that contains "scribd.com/".

    Raises:
        SystemExit: status 2 when the number of arguments is wrong, status 0
            after printing the help text, status 1 when the argument does not
            contain "scribd.com/".
    """
    if len(argv) != 1:
        print("Incorrect usage.\n"
              "Correct usage: ./hack.py [URL | -h | --help] > [FILE.html]\n"
              "Only works on \"scribd.com/doc/\" pages!\n\n",
              file=sys.stderr)
        # Non-zero status so shells and scripts can detect the misuse.
        sys.exit(2)

    argument = argv[0]

    if argument in ("-h", "--help"):
        print("Downloads url's webpage, unblurs pages and prints HTML.\n"
              "Only works on \"scribd.com/doc/\" pages!\n"
              "Usage: ./hack.py [URL | -h | --help] > [FILE].html\n\n",
              file=sys.stderr)
        # Asking for help is not an error.
        sys.exit(0)

    if "scribd.com/" not in argument:
        print("Url not from \"scribd.com/.\"\n"
              "Use -h or --help for help.", file=sys.stderr)
        sys.exit(1)
def removeElementByClass(className, driver):
    """Remove every element matching className from the page loaded in driver.

    Args:
        className: Class string handed to document.getElementsByClassName.
        driver: Object exposing execute_script(script, *args), i.e. the
            Selenium webdriver that has the page loaded.
    """
    # className is passed as a script argument instead of being concatenated
    # into the JavaScript source, so quotes or backslashes in it cannot break
    # or alter the script.
    driver.execute_script(
        "var element = document.getElementsByClassName(arguments[0]), i;"
        # Walk backwards: the collection shrinks as elements are removed.
        "for(i = element.length - 1; i >= 0; i--) {"
        "  element[i].parentNode.removeChild(element[i]);"
        "}",
        className)
# TODO: fix PhantomJS
def startWebDriver():
    """Start the first Selenium webdriver that can be launched.

    The drivers are tried in a fixed order: Chrome, Firefox, Safari, Edge,
    Opera, Ie.

    Returns:
        The started webdriver instance, or None when none of them could be
        started (a message is printed to stderr in that case).
    """
    driverNames = ["Chrome", "Firefox", "Safari", "Edge", "Opera", "Ie"]

    for driverName in driverNames:
        # Look the class up by name: the installed Selenium may not provide
        # every driver class (newer releases dropped some, e.g. Opera), and a
        # direct attribute access would raise before any browser is tried.
        driverClass = getattr(webdriver, driverName, None)
        if driverClass is None:
            continue

        try:
            return driverClass()
        except Exception as error:
            # Best effort: any failure to launch this browser (not installed,
            # missing driver binary, unsupported platform, ...) just means the
            # next candidate is tried. Report it instead of hiding it.
            print("Could not start " + driverName + ": " +
                  type(error).__name__, file=sys.stderr)

    print("No webdriver found.", file=sys.stderr)
    return None
def main(argv):
    """Download a Scribd page, strip unwanted elements, unblur it and print it.

    The resulting HTML is printed to stdout; status messages go to stderr.

    Args:
        argv: Command-line arguments without the program name; argv[0] is the
            URL to download once parseArgs() has accepted it.

    Raises:
        SystemExit: from parseArgs() on unusable arguments, or with status 1
            when no webdriver could be started.
    """
    parseArgs(argv)

    # Download page.
    driver = startWebDriver()
    if driver is None:
        # startWebDriver() has already reported the problem on stderr.
        sys.exit(1)

    try:
        driver.get(argv[0])
        # The result of this script is discarded.
        # NOTE(review): presumably kept as a round-trip to the browser before
        # the DOM is modified - confirm whether it is still needed.
        driver.execute_script('document.title')

        # Remove ads, fix scaling on first page and some other minor things.
        for className in elementsToRemove:
            removeElementByClass(className, driver)

        page_source = str(driver.page_source)
    finally:
        # Always close the browser, even when loading or editing the page fails.
        driver.quit()

    # The actual unblurring of the pages.
    for search, replacement in unblur:
        page_source = page_source.replace(search, replacement)

    print(page_source)
    print("\nDone!", file=sys.stderr)
# Script entry point: hand the command-line arguments (without the program
# name) to main() when the file is executed directly.
if __name__ == "__main__":
    cliArguments = sys.argv[1:]
    main(cliArguments)