import puppeteer from 'puppeteer';
/**
 * Optimized Threads Search Scraper.
 * Uses a MutationObserver to catch posts in a virtualized list and improved scrolling.
 *
 * @param {string} keyword - Search term (required; throws when falsy).
 * @param {number} [limit=20] - Maximum number of unique post URLs to collect.
 * @param {Array<object>} [cookies=[]] - Puppeteer cookie objects injected for an authenticated session.
 * @param {string} [searchMode='KEYWORD'] - Passed through as the `search_mode` query parameter.
 * @returns {Promise<{keyword: string, count: number, urls: string[]}>} Unique, query-stripped post URLs.
 * @throws {Error} When no keyword is given, or the page redirects to /login (auth required).
 */
export async function searchThreads(keyword, limit = 20, cookies = [], searchMode = 'KEYWORD') {
  if (!keyword) {
    throw new Error("Please provide a keyword for the search.");
  }

  const searchUrl = `https://www.threads.net/search?`
    + `q=${encodeURIComponent(keyword)}`
    + `&search_type=TOP`
    + `&search_mode=${searchMode}`;

  console.log(`[Search Start] Keyword: "${keyword}" | Target: ${limit} posts`);

  const browser = await puppeteer.launch({
    headless: true,
    // slowMo: 50,
    args: [
      '--disable-blink-features=AutomationControlled', // reduce automation fingerprint
      '--no-sandbox',
      '--disable-setuid-sandbox'
    ]
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36');

    if (cookies && cookies.length > 0) {
      console.log(`[Auth] Injecting ${cookies.length} cookies...`);
      await page.setCookie(...cookies);
    }

    await page.setViewport({ width: 1280, height: 1000 });
    await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 60000 });

    if (page.url().includes('/login')) {
      throw new Error("Authentication required: Redirected to login.");
    }

    // Wait for initial load — non-fatal, the scroll loop below may still find posts.
    try {
      await page.waitForSelector('a[href*="/post/"]', { timeout: 15000 });
    } catch (e) {
      console.warn("[Warning] Initial posts not found.");
    }

    /**
     * The Scrape Logic:
     * 1. Uses MutationObserver to watch for NEW links added to the DOM.
     * 2. Periodically scrolls and waits for the network to fetch more.
     * 3. Collects unique URLs into an internal Set.
     */
    const uniqueUrls = await page.evaluate(async (maxItems) => {
      const discovered = new Set();

      return new Promise((resolve) => {
        // Timer handles are declared up-front with `let` so cleanup() can run
        // safely even if the observer fires before the timers are created.
        // (The original `const scrollInterval` would throw a TDZ ReferenceError
        // if the observer hit `maxItems` before the interval was initialized.)
        let scrollInterval = null;
        let safetyTimer = null;
        let done = false;

        // Extract valid post links from the current DOM.
        const grabLinks = () => {
          const links = Array.from(document.querySelectorAll('a[href*="/post/"]'));
          links.forEach(a => {
            const href = a.href;
            // Filter out noise like /reposts or /replies sub-pages
            if (!href.includes('/reposts') && !href.includes('/replies')) {
              // Threads links often have queries, clean them for de-duplication
              const cleanUrl = href.split('?')[0];
              discovered.add(cleanUrl);
            }
          });
        };

        // Idempotent teardown: stop observing, cancel both timers, resolve once.
        const cleanup = () => {
          if (done) return;
          done = true;
          observer.disconnect();
          if (scrollInterval !== null) clearInterval(scrollInterval);
          if (safetyTimer !== null) clearTimeout(safetyTimer); // was leaked in the original
          resolve(Array.from(discovered));
        };

        // Setup observer to catch posts as they are rendered during scroll.
        const observer = new MutationObserver(() => {
          grabLinks();
          if (discovered.size >= maxItems) {
            cleanup();
          }
        });
        observer.observe(document.body, { childList: true, subtree: true });

        // Perform incremental scrolling.
        let lastHeight = document.body.scrollHeight;
        let scrollAttempts = 0;
        scrollInterval = setInterval(() => {
          window.scrollBy(0, 800);
          grabLinks(); // Manual grab just in case
          if (discovered.size >= maxItems) {
            cleanup();
            return;
          }
          // Check if we've hit the bottom and no new content is loading
          const newHeight = document.body.scrollHeight;
          if (newHeight === lastHeight) {
            scrollAttempts++;
            if (scrollAttempts > 10) cleanup(); // Stop if stuck for ~5 seconds
          } else {
            lastHeight = newHeight;
            scrollAttempts = 0;
          }
        }, 500);

        // Safety timeout: 45 seconds total for scraping.
        safetyTimer = setTimeout(cleanup, 45000);
      });
    }, limit);

    const resultUrls = uniqueUrls.slice(0, limit);
    console.log(`[Search Success] Extracted ${resultUrls.length} unique post URLs.`);

    return {
      keyword,
      count: resultUrls.length,
      urls: resultUrls
    };
  } catch (error) {
    console.error(`[Search Error] ${error.message}`);
    throw error;
  } finally {
    // Always release the browser, even on navigation/auth failures.
    await browser.close();
  }
}