import puppeteer from 'puppeteer';

/**
 * Builds the Threads search URL for the given query.
 *
 * Both user-controlled values (`keyword`, `searchMode`) are URL-encoded so
 * characters such as `&` or `#` cannot corrupt the query string.
 *
 * @param {string} keyword - The search term.
 * @param {number} maxResults - Value sent as the `limit` query parameter.
 * @param {string} searchMode - Value sent as the `search_mode` query parameter.
 * @returns {string} The absolute search URL.
 */
function buildThreadsSearchUrl(keyword, maxResults, searchMode) {
  const query = [
    `q=${encodeURIComponent(keyword)}`,
    'search_type=TOP',
    `limit=${maxResults}`,
    `search_mode=${encodeURIComponent(searchMode)}`,
    'media_type=TEXT',
  ].join('&');

  return `https://www.threads.net/search?${query}`;
}

/**
 * Scrolls the page step by step so that more results get a chance to load.
 *
 * Scrolling stops as soon as one of these holds: enough unique post URLs are
 * present in the DOM, the scrolled distance reached the document height, or
 * the scrolled distance exceeded a fixed safety cap.
 *
 * @param {object} page - Puppeteer page showing the search results.
 * @param {number} targetCount - Number of unique post URLs to aim for.
 * @returns {Promise<void>} Resolves once scrolling has stopped.
 */
async function scrollForThreadsPosts(page, targetCount) {
  const scrollSettings = {
    targetCount,
    stepPx: 500,
    intervalMs: 300,
    maxScrollPx: 15000,
  };

  // The callback runs inside the browser page, so it must be self-contained:
  // everything it needs is passed in through the serialised settings object.
  await page.evaluate(
    (settings) =>
      new Promise((resolve) => {
        let scrolledPx = 0;

        // Count what will actually be returned later (unique, non-action
        // post links) rather than raw anchors; otherwise duplicate anchors
        // would end the scrolling before enough distinct posts are loaded.
        const countUniquePosts = () => {
          const hrefs = Array.from(
            document.querySelectorAll('a[href*="/post/"]'),
            (anchor) => anchor.href,
          ).filter(
            (href) =>
              href.includes('/post/') &&
              !href.includes('/reposts') &&
              !href.includes('/replies'),
          );
          return new Set(hrefs).size;
        };

        const timer = setInterval(() => {
          // Read the height before scrolling, then advance one step.
          const { scrollHeight } = document.body;
          window.scrollBy(0, settings.stepPx);
          scrolledPx += settings.stepPx;

          const reachedBottom = scrolledPx >= scrollHeight;
          const hitSafetyCap = scrolledPx > settings.maxScrollPx;
          if (
            reachedBottom ||
            hitSafetyCap ||
            countUniquePosts() >= settings.targetCount
          ) {
            clearInterval(timer);
            resolve();
          }
        }, settings.intervalMs);
      }),
    scrollSettings,
  );
}

/**
 * Reads every post link currently present in the page.
 *
 * Links whose URL contains `/reposts` or `/replies` are dropped because they
 * belong to UI actions rather than to posts themselves.
 *
 * @param {object} page - Puppeteer page showing the search results.
 * @returns {Promise<string[]>} Absolute post URLs in DOM order (may repeat).
 */
async function extractThreadsPostUrls(page) {
  return page.evaluate(() =>
    Array.from(
      document.querySelectorAll('a[href*="/post/"]'),
      (anchor) => anchor.href,
    ).filter(
      (href) =>
        href.includes('/post/') &&
        !href.includes('/reposts') &&
        !href.includes('/replies'),
    ),
  );
}

/**
 * Threads Search Scraper
 * Opens the Threads search page for a keyword in a headless browser and
 * extracts the URLs of the posts shown in the results.
 *
 * @param {string} keyword - The search term.
 * @param {number} limit - Maximum number of URLs to return. Coerced to a
 *   non-negative integer; negative or non-numeric values are treated as 0.
 * @param {Array} cookies - Optional: Array of Puppeteer-formatted cookies to handle login.
 * @param {string} searchMode - Value sent as the `search_mode` query parameter.
 * @returns {Promise<{keyword: string, count: number, urls: string[]}>} The
 *   keyword, the number of unique post URLs found, and the URLs themselves.
 * @throws {Error} If `keyword` is empty, if Threads redirects to the login
 *   page, or if the browser/navigation fails.
 */
export async function searchThreads(keyword, limit = 20, cookies = [], searchMode = 'KEYWORD') {
  if (!keyword) {
    throw new Error("Please provide a keyword for the search.");
  }

  // A negative limit would make `slice(0, limit)` drop items from the END of
  // the list instead of capping it, so clamp to a non-negative integer.
  const requestedLimit = Math.floor(Number(limit));
  const maxResults = Number.isFinite(requestedLimit) ? Math.max(0, requestedLimit) : 0;

  const searchUrl = buildThreadsSearchUrl(keyword, maxResults, searchMode);
  console.log(`[Search Start] Keyword: "${keyword}" | URL: ${searchUrl}`);

  const browser = await puppeteer.launch({
    headless: "new",
    args: [
      '--disable-blink-features=AutomationControlled',
      '--no-sandbox',
      '--disable-setuid-sandbox',
    ],
  });

  try {
    const page = await browser.newPage();

    // Anti-detection headers: present as a regular desktop Chrome.
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36');
    await page.setExtraHTTPHeaders({
      'Accept-Language': 'en-US,en;q=0.9',
    });

    // Inject cookies if provided to bypass the login wall.
    if (cookies && cookies.length > 0) {
      console.log("[Auth] Injecting session cookies...");
      await page.setCookie(...cookies);
    }

    await page.setViewport({ width: 1280, height: 900 });

    // Navigate to the search results.
    await page.goto(searchUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    // A redirect to the login page means the session is missing or invalid.
    if (page.url().includes('/login')) {
      console.error("[Auth Error] Threads redirected to login. You must provide valid session cookies.");
      throw new Error("Authentication required: Threads search is restricted to logged-in users.");
    }

    // Wait for the first post link. Failing here is deliberately non-fatal:
    // the extraction below simply yields an empty list in that case.
    try {
      await page.waitForSelector('a[href*="/post/"]', { timeout: 20000 });
    } catch (e) {
      console.warn("[Warning] Results did not load. This might be a login wall or no results found.");
    }

    await scrollForThreadsPosts(page, maxResults);
    const postUrls = await extractThreadsPostUrls(page);

    // Deduplicate using a Set (keeps first-seen order), then cap at the limit.
    const uniqueUrls = [...new Set(postUrls)].slice(0, maxResults);

    console.log(`[Search Success] Found ${uniqueUrls.length} unique post URLs.`);

    return {
      keyword,
      count: uniqueUrls.length,
      urls: uniqueUrls,
    };
  } catch (error) {
    console.error(`[Search Error] ${error.message}`);
    throw error;
  } finally {
    // Always release the browser process, on success and on failure.
    await browser.close();
  }
}