import puppeteer from 'puppeteer';

/**
 * Optimized Threads Search Scraper.
 *
 * Opens the Threads search page for `keyword` in a headless browser, scrolls
 * the result list and collects the URLs of posts as they are rendered. A
 * MutationObserver is used in addition to periodic polling so that links
 * which appear between scroll ticks are still recorded.
 *
 * @param {string} keyword - Search term; must be truthy.
 * @param {number} [limit=20] - Maximum number of post URLs to return.
 * @param {Array<object>} [cookies=[]] - Cookie objects forwarded to
 *   `page.setCookie` before navigation; skipped when empty or falsy.
 * @param {string} [searchMode='KEYWORD'] - Value sent as the `search_mode`
 *   query parameter.
 * @returns {Promise<{keyword: string, count: number, urls: string[]}>}
 *   The keyword, the number of URLs returned and the URLs themselves
 *   (at most `limit`).
 * @throws {Error} If `keyword` is falsy, if the page is redirected to a URL
 *   containing `/login`, or if any browser operation fails (the error is
 *   logged and rethrown). The browser is closed in every case.
 */
export async function searchThreads(keyword, limit = 20, cookies = [], searchMode = 'KEYWORD') {
  if (!keyword) {
    throw new Error("Please provide a keyword for the search.");
  }

  // Both user-supplied values are percent-encoded so that characters such as
  // "&" or "#" cannot alter the structure of the query string.
  const searchUrl =
    `https://www.threads.net/search?` +
    `q=${encodeURIComponent(keyword)}` +
    `&search_type=TOP` +
    `&search_mode=${encodeURIComponent(searchMode)}`;

  console.log(`[Search Start] Keyword: "${keyword}" | Target: ${limit} posts`);

  const browser = await puppeteer.launch({
    headless: true,
    args: [
      '--disable-blink-features=AutomationControlled',
      '--no-sandbox',
      '--disable-setuid-sandbox'
    ]
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36');

    if (cookies && cookies.length > 0) {
      console.log(`[Auth] Injecting ${cookies.length} cookies...`);
      await page.setCookie(...cookies);
    }

    await page.setViewport({ width: 1280, height: 1000 });
    await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 60000 });

    if (page.url().includes('/login')) {
      throw new Error("Authentication required: Redirected to login.");
    }

    // Wait for the initial results. This is best-effort: if nothing shows up
    // in time we still run the scroll loop, which has its own stop conditions.
    try {
      await page.waitForSelector('a[href*="/post/"]', { timeout: 15000 });
    } catch (e) {
      console.warn("[Warning] Initial posts not found.");
    }

    /*
     * The scrape logic (runs inside the page, so it cannot reference any
     * variable from this module other than the serialized `limit` argument):
     *   1. A MutationObserver records post links whenever the DOM changes.
     *   2. A 500 ms interval scrolls down by 800 px and records links again.
     *   3. Unique, cleaned URLs are accumulated in a Set.
     * Collection ends when the target count is reached, when the page height
     * has been unchanged for more than 10 consecutive ticks, or after 45 s.
     */
    const uniqueUrls = await page.evaluate((maxItems) => {
      const discovered = new Set();

      return new Promise((resolve) => {
        let finished = false;

        // Record every post link currently present in the DOM.
        const grabLinks = () => {
          for (const anchor of document.querySelectorAll('a[href*="/post/"]')) {
            const href = anchor.href;
            // Filter out noise like /reposts or /replies sub-pages.
            if (href.includes('/reposts') || href.includes('/replies')) {
              continue;
            }
            // Drop the query string and fragment so the same post is not
            // counted twice under different decorations.
            discovered.add(href.split(/[?#]/)[0]);
          }
        };

        // Tear everything down and resolve. Several sources can trigger this
        // (observer, interval, safety timer), so it is guarded to run once.
        // It is only ever invoked asynchronously, after the constants it
        // references below have been initialized.
        const cleanup = () => {
          if (finished) {
            return;
          }
          finished = true;
          observer.disconnect();
          clearInterval(scrollInterval);
          clearTimeout(safetyTimer);
          resolve(Array.from(discovered));
        };

        // Catch posts as they are rendered during scroll.
        const observer = new MutationObserver(() => {
          grabLinks();
          if (discovered.size >= maxItems) {
            cleanup();
          }
        });
        observer.observe(document.body, { childList: true, subtree: true });

        // Incremental scrolling with stall detection.
        let lastHeight = document.body.scrollHeight;
        let scrollAttempts = 0;

        const scrollInterval = setInterval(() => {
          window.scrollBy(0, 800);
          grabLinks(); // Manual grab in case the observer has not fired yet.

          if (discovered.size >= maxItems) {
            cleanup();
            return;
          }

          // An unchanged height means no new content was appended this tick.
          const newHeight = document.body.scrollHeight;
          if (newHeight === lastHeight) {
            scrollAttempts++;
            if (scrollAttempts > 10) {
              cleanup(); // Stuck for roughly 5 seconds.
            }
          } else {
            lastHeight = newHeight;
            scrollAttempts = 0;
          }
        }, 500);

        // Safety timeout: 45 seconds total for scraping.
        const safetyTimer = setTimeout(cleanup, 45000);
      });
    }, limit);

    // The final pass may have collected more links than requested.
    const resultUrls = uniqueUrls.slice(0, limit);
    console.log(`[Search Success] Extracted ${resultUrls.length} unique post URLs.`);

    return {
      keyword,
      count: resultUrls.length,
      urls: resultUrls
    };
  } catch (error) {
    console.error(`[Search Error] ${error.message}`);
    throw error;
  } finally {
    await browser.close();
  }
}