You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
138 lines
5.0 KiB
import puppeteer from 'puppeteer';
|
|
|
|
/**
 * Optimized Threads Search Scraper.
 *
 * Opens the Threads search results page for `keyword`, then collects unique
 * post URLs from the virtualized feed using a MutationObserver (to catch
 * posts as they are rendered) combined with incremental scrolling.
 *
 * @param {string} keyword - Search term; required.
 * @param {number} [limit=20] - Maximum number of post URLs to collect.
 * @param {Array<object>} [cookies=[]] - Puppeteer cookie objects injected
 *   before navigation (for authenticated sessions).
 * @param {string} [searchMode='KEYWORD'] - Value sent as the `search_mode`
 *   query parameter.
 * @returns {Promise<{keyword: string, count: number, urls: string[]}>}
 * @throws {Error} If `keyword` is missing, or the page redirects to login.
 */
export async function searchThreads(keyword, limit = 20, cookies = [], searchMode = 'KEYWORD') {
  if (!keyword) {
    throw new Error("Please provide a keyword for the search.");
  }

  // Build the URL via URL/searchParams instead of string concatenation so
  // every parameter (not just `q`) is percent-encoded consistently.
  const searchUrl = new URL('https://www.threads.net/search');
  searchUrl.searchParams.set('q', keyword);
  searchUrl.searchParams.set('search_type', 'TOP');
  searchUrl.searchParams.set('search_mode', searchMode);

  console.log(`[Search Start] Keyword: "${keyword}" | Target: ${limit} posts`);

  const browser = await puppeteer.launch({
    headless: true,
    // slowMo: 50,
    args: [
      '--disable-blink-features=AutomationControlled',
      '--no-sandbox',
      '--disable-setuid-sandbox'
    ]
  });

  try {
    const page = await browser.newPage();

    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36');

    if (cookies && cookies.length > 0) {
      console.log(`[Auth] Injecting ${cookies.length} cookies...`);
      await page.setCookie(...cookies);
    }

    await page.setViewport({ width: 1280, height: 1000 });

    await page.goto(searchUrl.toString(), { waitUntil: 'networkidle2', timeout: 60000 });

    // Threads bounces unauthenticated/blocked sessions to the login page.
    if (page.url().includes('/login')) {
      throw new Error("Authentication required: Redirected to login.");
    }

    // Wait for initial load; non-fatal because the observer below may still
    // pick up posts that render late.
    try {
      await page.waitForSelector('a[href*="/post/"]', { timeout: 15000 });
    } catch (e) {
      console.warn("[Warning] Initial posts not found.");
    }

    /**
     * The Scrape Logic (runs in the page context):
     * 1. Uses MutationObserver to watch for NEW links added to the DOM.
     * 2. Periodically scrolls and waits for the network to fetch more.
     * 3. Collects unique, query-stripped URLs into an internal Set.
     */
    const uniqueUrls = await page.evaluate(async (maxItems) => {
      const discovered = new Set();

      return new Promise((resolve) => {
        let finished = false;

        // Extract valid post links from the current DOM.
        const grabLinks = () => {
          const links = Array.from(document.querySelectorAll('a[href*="/post/"]'));
          links.forEach(a => {
            const href = a.href;
            // Filter out noise like /reposts or /replies sub-pages.
            if (!href.includes('/reposts') && !href.includes('/replies')) {
              // Threads links often carry queries; strip them so the same
              // post with different params de-duplicates to one entry.
              const cleanUrl = href.split('?')[0];
              discovered.add(cleanUrl);
            }
          });
        };

        // Setup observer to catch posts as they are rendered during scroll.
        const observer = new MutationObserver(() => {
          grabLinks();
          if (discovered.size >= maxItems) {
            cleanup();
          }
        });

        observer.observe(document.body, { childList: true, subtree: true });

        // Idempotent teardown: the observer, the scroll interval, and the
        // safety timeout can all race to finish — run it only once, and
        // clear the safety timer so no stale callback fires later.
        const cleanup = () => {
          if (finished) return;
          finished = true;
          observer.disconnect();
          clearInterval(scrollInterval);
          clearTimeout(safetyTimer);
          resolve(Array.from(discovered));
        };

        // Grab whatever is already rendered so a small `maxItems` can be
        // satisfied immediately instead of waiting for the first tick.
        grabLinks();
        if (discovered.size >= maxItems) {
          cleanup();
        }

        // Perform incremental scrolling.
        let lastHeight = document.body.scrollHeight;
        let scrollAttempts = 0;

        const scrollInterval = setInterval(() => {
          window.scrollBy(0, 800);
          grabLinks(); // Manual grab in case the observer missed a batch.

          if (discovered.size >= maxItems) {
            cleanup();
            return;
          }

          // Check if we've hit the bottom and no new content is loading.
          const newHeight = document.body.scrollHeight;
          if (newHeight === lastHeight) {
            scrollAttempts++;
            if (scrollAttempts > 10) cleanup(); // Stop if stuck for ~5 seconds.
          } else {
            lastHeight = newHeight;
            scrollAttempts = 0;
          }
        }, 500);

        // Safety timeout: 45 seconds total for scraping.
        const safetyTimer = setTimeout(cleanup, 45000);
      });
    }, limit);

    const resultUrls = uniqueUrls.slice(0, limit);
    console.log(`[Search Success] Extracted ${resultUrls.length} unique post URLs.`);

    return {
      keyword,
      count: resultUrls.length,
      urls: resultUrls
    };
  } catch (error) {
    console.error(`[Search Error] ${error.message}`);
    throw error;
  } finally {
    await browser.close();
  }
}