import puppeteer from 'puppeteer';
/**
 * Optimized Threads Search Scraper.
 * Uses a MutationObserver to catch posts in a virtualized list and improved scrolling.
 *
 * @param {string} keyword - Search term (required; throws when falsy).
 * @param {number} [limit=20] - Maximum number of unique post URLs to collect.
 * @param {Array<object>} [cookies=[]] - Puppeteer cookie objects injected for an authenticated session.
 * @param {string} [searchMode='KEYWORD'] - Passed through as the `search_mode` query parameter.
 * @returns {Promise<{keyword: string, count: number, urls: string[]}>} Unique, query-stripped post URLs.
 * @throws {Error} When no keyword is given, or the page redirects to /login (auth required).
 */
export async function searchThreads(keyword, limit = 20, cookies = [], searchMode = 'KEYWORD') {
  if (!keyword) {
    throw new Error("Please provide a keyword for the search.");
  }

  const searchUrl = `https://www.threads.net/search?`
    + `q=${encodeURIComponent(keyword)}`
    + `&search_type=TOP`
    + `&search_mode=${searchMode}`;

  console.log(`[Search Start] Keyword: "${keyword}" | Target: ${limit} posts`);

  const browser = await puppeteer.launch({
    headless: true,
    // slowMo: 50,
    args: [
      '--disable-blink-features=AutomationControlled', // reduce automation fingerprint
      '--no-sandbox',
      '--disable-setuid-sandbox'
    ]
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36');

    if (cookies && cookies.length > 0) {
      console.log(`[Auth] Injecting ${cookies.length} cookies...`);
      await page.setCookie(...cookies);
    }

    await page.setViewport({ width: 1280, height: 1000 });
    await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 60000 });

    if (page.url().includes('/login')) {
      throw new Error("Authentication required: Redirected to login.");
    }

    // Wait for initial load — non-fatal, the scroll loop below may still find posts.
    try {
      await page.waitForSelector('a[href*="/post/"]', { timeout: 15000 });
    } catch (e) {
      console.warn("[Warning] Initial posts not found.");
    }

    /**
     * The Scrape Logic:
     * 1. Uses MutationObserver to watch for NEW links added to the DOM.
     * 2. Periodically scrolls and waits for the network to fetch more.
     * 3. Collects unique URLs into an internal Set.
     */
    const uniqueUrls = await page.evaluate(async (maxItems) => {
      const discovered = new Set();

      return new Promise((resolve) => {
        // Timer handles are declared up-front with `let` so cleanup() can run
        // safely even if the observer fires before the timers are created.
        // (The original `const scrollInterval` would throw a TDZ ReferenceError
        // if the observer hit `maxItems` before the interval was initialized.)
        let scrollInterval = null;
        let safetyTimer = null;
        let done = false;

        // Extract valid post links from the current DOM.
        const grabLinks = () => {
          const links = Array.from(document.querySelectorAll('a[href*="/post/"]'));
          links.forEach(a => {
            const href = a.href;
            // Filter out noise like /reposts or /replies sub-pages
            if (!href.includes('/reposts') && !href.includes('/replies')) {
              // Threads links often have queries, clean them for de-duplication
              const cleanUrl = href.split('?')[0];
              discovered.add(cleanUrl);
            }
          });
        };

        // Idempotent teardown: stop observing, cancel both timers, resolve once.
        const cleanup = () => {
          if (done) return;
          done = true;
          observer.disconnect();
          if (scrollInterval !== null) clearInterval(scrollInterval);
          if (safetyTimer !== null) clearTimeout(safetyTimer); // was leaked in the original
          resolve(Array.from(discovered));
        };

        // Setup observer to catch posts as they are rendered during scroll.
        const observer = new MutationObserver(() => {
          grabLinks();
          if (discovered.size >= maxItems) {
            cleanup();
          }
        });
        observer.observe(document.body, { childList: true, subtree: true });

        // Perform incremental scrolling.
        let lastHeight = document.body.scrollHeight;
        let scrollAttempts = 0;
        scrollInterval = setInterval(() => {
          window.scrollBy(0, 800);
          grabLinks(); // Manual grab just in case
          if (discovered.size >= maxItems) {
            cleanup();
            return;
          }
          // Check if we've hit the bottom and no new content is loading
          const newHeight = document.body.scrollHeight;
          if (newHeight === lastHeight) {
            scrollAttempts++;
            if (scrollAttempts > 10) cleanup(); // Stop if stuck for ~5 seconds
          } else {
            lastHeight = newHeight;
            scrollAttempts = 0;
          }
        }, 500);

        // Safety timeout: 45 seconds total for scraping.
        safetyTimer = setTimeout(cleanup, 45000);
      });
    }, limit);

    const resultUrls = uniqueUrls.slice(0, limit);
    console.log(`[Search Success] Extracted ${resultUrls.length} unique post URLs.`);

    return {
      keyword,
      count: resultUrls.length,
      urls: resultUrls
    };
  } catch (error) {
    console.error(`[Search Error] ${error.message}`);
    throw error;
  } finally {
    // Always release the browser, even on navigation/auth failures.
    await browser.close();
  }
}