-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper_bot.js
272 lines (241 loc) · 9.46 KB
/
scraper_bot.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
// Third-party and Node core modules used by the scraper.
const puppeteer = require("puppeteer-extra");
const stealthPlugin = require("puppeteer-extra-plugin-stealth");
const fs = require("fs");
const readline = require("readline");
const axios = require("axios");
const path = require("path");
// Register the stealth plugin on puppeteer-extra before main() launches a browser.
puppeteer.use(stealthPlugin());
// executablePath() is passed to puppeteer.launch() in main().
const { executablePath } = require("puppeteer");
// Start page that main() opens before submitting the search form.
const url = "https://rule34.xxx/";
/**
 * Download a remote file to disk by streaming the HTTP response body.
 *
 * Any failure (request error, error on the response stream, or error on the
 * write stream) is logged and swallowed, so a single bad download does not
 * abort the caller. The target directory is not created here; it must exist.
 *
 * @param {string} url - Address of the file to fetch.
 * @param {string} directory - Directory the file is written into.
 * @param {string} filename - Name of the file inside `directory`.
 * @returns {Promise<void>} Settles once the file has been fully written or
 *   the failure has been logged.
 */
const downloadFile = async (url, directory, filename) => {
  try {
    const response = await axios({
      url,
      method: "GET",
      responseType: "stream",
    });
    const outputPath = path.join(directory, filename);
    const writer = fs.createWriteStream(outputPath);
    // Await here (instead of returning the promise) so that stream failures
    // are handled by the catch below like every other download failure.
    await new Promise((resolve, reject) => {
      writer.on("finish", resolve);
      writer.on("error", reject);
      // pipe() does not forward source errors to the destination; without
      // this handler a broken response would leave the promise pending.
      response.data.on("error", (streamError) => {
        writer.destroy();
        reject(streamError);
      });
      response.data.pipe(writer);
    });
  } catch (error) {
    console.error(`Failed to download ${url}:`, error);
  }
};
/**
 * Pull the file extension out of a URL string.
 *
 * The extension is the first run of alphanumerics that follows a dot and is
 * immediately followed by `?`, `#`, or the end of the string.
 *
 * @param {string} url - URL (or path) to inspect.
 * @returns {string} The extension without the dot, or "" when none is found.
 */
const getFileExtension = (url) => {
  const extensionPattern = /\.([a-zA-Z0-9]+)(?:[\?#]|$)/;
  const found = url.match(extensionPattern);
  if (found === null) {
    return "";
  }
  const [, extension] = found;
  return extension;
};
/**
 * Ask one question on a readline interface.
 *
 * @param {readline.Interface} rl - Interface to prompt on.
 * @param {string} prompt - Text shown to the user.
 * @returns {Promise<string>} The line the user entered.
 */
const askQuestion = (rl, prompt) =>
  new Promise((resolve) => {
    rl.question(prompt, resolve);
  });

/**
 * Print the banner and menu, then read the search tags and the output choice.
 * The readline interface is always closed before returning.
 *
 * @returns {Promise<{searchTag: string, choice: string}>} Raw user answers.
 */
const promptForSearch = async () => {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });
  try {
    console.log("-------------------------------------------");
    console.log(" Welcome to Rule34 Scraper Bot ");
    console.log("-------------------------------------------");
    console.log(
      "Just enter the appropriate tags, follow the same convention used in Rule34.\nJson file, Images and Videos folder will be generated in your current path.\nGenerated json file will include images, videos and other meta data.\nIf you encounter an bug, please open a issue: https://github.com/Shivam171/rule34-scraper-bot\n\nNOTE: To close the program enter {ctrl + c}"
    );
    console.log("-------------------------------------------");
    const searchTag = await askQuestion(rl, "Enter your search tags: ");
    console.log("-------------------------------------------");
    console.log(" Choose your option ");
    console.log("1. JSON data Only ");
    console.log("2. Images Only ");
    console.log("3. Videos Only ");
    console.log("4. All above ");
    console.log("-------------------------------------------");
    const choice = await askQuestion(rl, "Enter your choice: ");
    console.log("-------------------------------------------");
    return { searchTag, choice };
  } finally {
    rl.close();
  }
};

/**
 * Collect the media source and metadata of the currently open post page.
 *
 * This function is handed to page.evaluate(), so it executes in the browser
 * page and must not reference anything from the Node.js scope.
 *
 * @returns {{type: string, src: string, id: string, posted_date: string,
 *   posted_on: string, posted_by: string, size: string,
 *   external_source: string}} Scraped fields; missing values are "".
 */
const extractPostData = () => {
  const imgSrc = document.querySelector("#image")?.src || "";
  const type = imgSrc ? "image" : "video";
  const src = imgSrc || document.querySelector("video source")?.src || "";
  const statsElement = document.querySelector("#stats");
  if (!statsElement) {
    return {
      type,
      src,
      id: "",
      posted_date: "",
      posted_on: "",
      posted_by: "",
      size: "",
      external_source: "",
    };
  }
  // Trimmed text of the n-th <li> in the stats list, or "" when absent.
  const itemText = (n) =>
    statsElement.querySelector(`li:nth-child(${n})`)?.textContent?.trim() || "";
  const id = itemText(1).split(": ")[1] || "";
  const postedParts = itemText(2).split("\nby\n\n");
  const posted_date = postedParts[0].split(" ")[1] || "";
  const posted_on = postedParts[0].split(" ")[2] || "";
  const posted_by = postedParts[1] || "";
  const size = itemText(3).split(": ")[1] || "";
  const external_source =
    statsElement.querySelector("li:nth-child(4) a")?.href || "";
  return {
    type,
    src,
    id,
    posted_date,
    posted_on,
    posted_by,
    size,
    external_source,
  };
};

/**
 * Interactive entry point: open the site, ask for tags and an output choice,
 * run the search, then visit every result on every result page, recording
 * metadata and downloading media according to the choice.
 *
 * Entries already present in `<tags>_scrapped_data.json` (same src and id)
 * are skipped. The browser is closed on every exit path.
 *
 * @returns {Promise<void>} Resolves when scraping has finished; rejects on
 *   failures outside the per-page loop (launch, prompt, corrupt JSON file).
 */
const main = async () => {
  const browser = await puppeteer.launch({
    headless: true,
    executablePath: executablePath(),
  });
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: "load" });
    // Wait for the search form to load
    await page.waitForSelector("form > input[type=submit]");

    const { searchTag, choice } = await promptForSearch();
    const selectedOption = choice.trim();
    const downloadImages = selectedOption === "2" || selectedOption === "4";
    const downloadVideos = selectedOption === "3" || selectedOption === "4";
    const saveJson = selectedOption === "1" || selectedOption === "4";

    // Type the search tag into the search box and submit the form
    await page.type("#tags", searchTag);
    const searchBtn = await page.$("form > input[type=submit]");
    if (searchBtn) {
      console.log("Searching...");
      console.log("Retriving data from rule34...");
      console.log("Note: This may take a while, Please be patient...");
      console.log("-------------------------------------------");
      // Start waiting before clicking so the navigation cannot be missed.
      await Promise.all([
        page.waitForNavigation({ waitUntil: "load" }),
        searchBtn.click(),
      ]);
    }

    // Load existing data from the JSON file
    const outputFileName = `${searchTag.replace(
      /\s+/g,
      "_"
    )}_scrapped_data.json`;
    let existingData = [];
    if (fs.existsSync(outputFileName)) {
      existingData = JSON.parse(fs.readFileSync(outputFileName, "utf8"));
      if (!Array.isArray(existingData)) {
        throw new Error(`${outputFileName} does not contain a JSON array`);
      }
    }
    // Keys of already recorded entries; JSON encoding keeps the comparison
    // as strict as comparing src and id individually.
    const entryKey = (item) => JSON.stringify([item.src, item.id]);
    const seenKeys = new Set(existingData.map(entryKey));

    let pageCount = 0;
    while (true) {
      try {
        // Extract href attributes from all a tags inside .thumb spans
        const hrefs = await page.evaluate(() =>
          Array.from(document.querySelectorAll(".thumb a"), (a) => a.href)
        );
        for (const [index, href] of hrefs.entries()) {
          console.log(`Navigating to ${index + 1} of ${hrefs.length}`);
          await page.goto(href, { waitUntil: "networkidle2" });
          // Only on the very first post: click the "Always view original" link
          if (pageCount === 0 && index === 0) {
            const alwaysViewOriginalLink = await page.$(
              "#resized_notice > a:nth-child(2)"
            );
            if (alwaysViewOriginalLink) {
              await alwaysViewOriginalLink.click({ delay: 100 });
            }
          }
          const data = await page.evaluate(extractPostData);
          const key = entryKey(data);
          if (!seenKeys.has(key)) {
            seenKeys.add(key);
            existingData.push(data);
            // Download the media file if it matches the user's choice
            const isWanted =
              (data.type === "image" && downloadImages) ||
              (data.type === "video" && downloadVideos);
            // Skip posts where no media URL could be found.
            if (isWanted && data.src) {
              const directory = data.type === "image" ? "images" : "videos";
              fs.mkdirSync(directory, { recursive: true });
              const filename = `${data.id}.${getFileExtension(data.src)}`;
              await downloadFile(data.src, directory, filename);
            }
            // Persist after every new entry so progress survives an abort.
            if (saveJson) {
              fs.writeFileSync(
                outputFileName,
                JSON.stringify(existingData, null, 2)
              );
            }
          }
          // Navigate back after visiting each href
          await page.goBack();
        }
        // Select the next button and navigate to the next page
        const nextButtonElement = await page.$('a[alt="next"]');
        if (!nextButtonElement) {
          console.log("No more pages to visit!");
          break;
        }
        await Promise.all([
          page.waitForNavigation({ waitUntil: "load" }),
          nextButtonElement.click(),
        ]);
        pageCount++;
        console.log(`Visited page count: ${pageCount}`);
      } catch (err) {
        console.log("Some error occured: ", err);
        break;
      }
    }
  } finally {
    await browser.close();
  }
};
// Entry point: run the scraper; report any failure that escapes main() on
// stderr and make the process exit with a non-zero status.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});