|
|
|
@ -1,27 +1,24 @@ |
|
|
|
import puppeteer from 'puppeteer'; |
|
|
|
import puppeteer from 'puppeteer'; |
|
|
|
|
|
|
|
|
|
|
|
/**
 * Optimized Threads Search Scraper
 * Navigates to the search page, enters a keyword, and extracts post URLs.
 * Uses a MutationObserver to catch posts in a virtualized list and improved scrolling.
 *
 * @param {string} keyword - The search term.
 * @param {number} [limit=20] - Maximum number of URLs to return.
 * @param {Array} [cookies=[]] - Optional: Array of Puppeteer-formatted cookies to handle login.
 * @param {string} [searchMode='KEYWORD'] - Search mode forwarded to the Threads search URL.
 * @returns {Promise<{keyword: string, count: number, urls: string[]}>} The unique post URLs found.
 */
|
|
|
export async function searchThreads(keyword, limit = 20, cookies = [], searchMode='KEYWORD') { |
|
|
|
export async function searchThreads(keyword, limit = 20, cookies = [], searchMode = 'KEYWORD') { |
|
|
|
if (!keyword) { |
|
|
|
if (!keyword) { |
|
|
|
throw new Error("Please provide a keyword for the search."); |
|
|
|
throw new Error("Please provide a keyword for the search."); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
const searchUrl = `https://www.threads.net/search?` |
|
|
|
const searchUrl = `https://www.threads.net/search?` |
|
|
|
+`q=${encodeURIComponent(keyword)}` |
|
|
|
+ `q=${encodeURIComponent(keyword)}` |
|
|
|
+`&search_type=TOP` |
|
|
|
+ `&search_type=TOP` |
|
|
|
+`&limit=${limit}` |
|
|
|
+ `&search_mode=${searchMode}`; |
|
|
|
+`&search_mode=${searchMode}` |
|
|
|
|
|
|
|
+`&media_type=TEXT`; |
|
|
|
console.log(`[Search Start] Keyword: "${keyword}" | Target: ${limit} posts`); |
|
|
|
console.log(`[Search Start] Keyword: "${keyword}" | URL: ${searchUrl}`); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const browser = await puppeteer.launch({ |
|
|
|
const browser = await puppeteer.launch({ |
|
|
|
headless: "new", |
|
|
|
headless: true,
|
|
|
|
|
|
|
|
// slowMo: 50,
|
|
|
|
args: [ |
|
|
|
args: [ |
|
|
|
'--disable-blink-features=AutomationControlled', |
|
|
|
'--disable-blink-features=AutomationControlled', |
|
|
|
'--no-sandbox', |
|
|
|
'--no-sandbox', |
|
|
|
@ -32,89 +29,104 @@ export async function searchThreads(keyword, limit = 20, cookies = [], searchMod |
|
|
|
try { |
|
|
|
try { |
|
|
|
const page = await browser.newPage(); |
|
|
|
const page = await browser.newPage(); |
|
|
|
|
|
|
|
|
|
|
|
// Anti-detection headers
|
|
|
|
|
|
|
|
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'); |
|
|
|
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'); |
|
|
|
await page.setExtraHTTPHeaders({ |
|
|
|
|
|
|
|
'Accept-Language': 'en-US,en;q=0.9', |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (cookies && cookies.length > 0) { |
|
|
|
if (cookies && cookies.length > 0) { |
|
|
|
console.log(`[Auth] 正在注入 ${cookies.length} 個 Session Cookies...`); |
|
|
|
console.log(`[Auth] Injecting ${cookies.length} cookies...`); |
|
|
|
try { |
|
|
|
await page.setCookie(...cookies); |
|
|
|
// 確保每個 cookie 對象至少包含 name, value 以及 domain 或 url
|
|
|
|
|
|
|
|
await browser.setCookie(...cookies); |
|
|
|
|
|
|
|
console.log("[Auth] Cookies 注入成功"); |
|
|
|
|
|
|
|
// console.log(await browser.cookies());
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} catch (cookieError) { |
|
|
|
|
|
|
|
console.error("[Auth] Cookies 格式錯誤或注入失敗:", cookieError.message); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
await page.setViewport({ width: 1280, height: 900 }); |
|
|
|
await page.setViewport({ width: 1280, height: 1000 }); |
|
|
|
|
|
|
|
|
|
|
|
// Navigate to search results
|
|
|
|
await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 60000 }); |
|
|
|
const response = await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 60000 }); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Check if we were redirected to the login page
|
|
|
|
if (page.url().includes('/login')) { |
|
|
|
const currentUrl = page.url(); |
|
|
|
throw new Error("Authentication required: Redirected to login."); |
|
|
|
if (currentUrl.includes('/login')) { |
|
|
|
|
|
|
|
console.error("[Auth Error] Threads redirected to login. You must provide valid session cookies."); |
|
|
|
|
|
|
|
throw new Error("Authentication required: Threads search is restricted to logged-in users."); |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Wait for the results to start appearing
|
|
|
|
// Wait for initial load
|
|
|
|
try { |
|
|
|
try { |
|
|
|
// Threads search result container or post links
|
|
|
|
await page.waitForSelector('a[href*="/post/"]', { timeout: 15000 }); |
|
|
|
await page.waitForSelector('a[href*="/post/"]', { timeout: 20000 }); |
|
|
|
|
|
|
|
} catch (e) { |
|
|
|
} catch (e) { |
|
|
|
console.warn("[Warning] Results did not load. This might be a login wall or no results found."); |
|
|
|
console.warn("[Warning] Initial posts not found."); |
|
|
|
// Take a screenshot for debugging if needed (local environment only)
|
|
|
|
|
|
|
|
// await page.screenshot({ path: 'search_debug.png' });
|
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Scroll logic to gather more results
|
|
|
|
/** |
|
|
|
await page.evaluate(async (maxItems) => { |
|
|
|
* The Scrape Logic:
|
|
|
|
await new Promise((resolve) => { |
|
|
|
* 1. Uses MutationObserver to watch for NEW links added to the DOM. |
|
|
|
let totalHeight = 0; |
|
|
|
* 2. Periodically scrolls and waits for the network to fetch more. |
|
|
|
let distance = 500; |
|
|
|
* 3. Collects unique URLs into an Internal Set. |
|
|
|
let timer = setInterval(() => { |
|
|
|
*/ |
|
|
|
let scrollHeight = document.body.scrollHeight; |
|
|
|
const uniqueUrls = await page.evaluate(async (maxItems) => { |
|
|
|
window.scrollBy(0, distance); |
|
|
|
const discovered = new Set(); |
|
|
|
totalHeight += distance; |
|
|
|
|
|
|
|
|
|
|
|
return new Promise((resolve) => { |
|
|
|
const currentLinks = document.querySelectorAll('a[href*="/post/"]').length; |
|
|
|
// Function to extract valid post links from current DOM
|
|
|
|
// Stop if we hit the limit, bottom of page, or a safety cap
|
|
|
|
const grabLinks = () => { |
|
|
|
if (totalHeight >= scrollHeight || currentLinks >= maxItems || totalHeight > 15000) { |
|
|
|
const links = Array.from(document.querySelectorAll('a[href*="/post/"]')); |
|
|
|
clearInterval(timer); |
|
|
|
links.forEach(a => { |
|
|
|
resolve(); |
|
|
|
const href = a.href; |
|
|
|
|
|
|
|
// Filter out noise like /reposts or /replies sub-pages
|
|
|
|
|
|
|
|
if (!href.includes('/reposts') && !href.includes('/replies')) { |
|
|
|
|
|
|
|
// Threads links often have queries, clean them for de-duplication
|
|
|
|
|
|
|
|
const cleanUrl = href.split('?')[0]; |
|
|
|
|
|
|
|
discovered.add(cleanUrl); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Setup observer to catch posts as they are rendered during scroll
|
|
|
|
|
|
|
|
const observer = new MutationObserver(() => { |
|
|
|
|
|
|
|
grabLinks(); |
|
|
|
|
|
|
|
if (discovered.size >= maxItems) { |
|
|
|
|
|
|
|
cleanup(); |
|
|
|
} |
|
|
|
} |
|
|
|
}, 300); |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
}, limit); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Extracting the URLs
|
|
|
|
|
|
|
|
const postUrls = await page.evaluate(() => { |
|
|
|
|
|
|
|
const links = Array.from(document.querySelectorAll('a[href*="/post/"]')); |
|
|
|
|
|
|
|
return links |
|
|
|
|
|
|
|
.map(a => a.href) |
|
|
|
|
|
|
|
// Filter for actual post links and ignore UI elements like repost/share buttons
|
|
|
|
|
|
|
|
.filter(href => { |
|
|
|
|
|
|
|
const isPost = href.includes('/post/'); |
|
|
|
|
|
|
|
const isNotAction = !href.includes('/reposts') && !href.includes('/replies'); |
|
|
|
|
|
|
|
return isPost && isNotAction; |
|
|
|
|
|
|
|
}); |
|
|
|
}); |
|
|
|
}); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Deduplicate using a Set
|
|
|
|
observer.observe(document.body, { childList: true, subtree: true }); |
|
|
|
const uniqueUrls = [...new Set(postUrls)].slice(0, limit); |
|
|
|
|
|
|
|
|
|
|
|
const cleanup = () => { |
|
|
|
|
|
|
|
observer.disconnect(); |
|
|
|
|
|
|
|
clearInterval(scrollInterval); |
|
|
|
|
|
|
|
resolve(Array.from(discovered)); |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Perform incremental scrolling
|
|
|
|
|
|
|
|
let lastHeight = document.body.scrollHeight; |
|
|
|
|
|
|
|
let scrollAttempts = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const scrollInterval = setInterval(() => { |
|
|
|
|
|
|
|
window.scrollBy(0, 800); |
|
|
|
|
|
|
|
grabLinks(); // Manual grab just in case
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (discovered.size >= maxItems) { |
|
|
|
|
|
|
|
cleanup(); |
|
|
|
|
|
|
|
return; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Check if we've hit the bottom and no new content is loading
|
|
|
|
|
|
|
|
let newHeight = document.body.scrollHeight; |
|
|
|
|
|
|
|
if (newHeight === lastHeight) { |
|
|
|
|
|
|
|
scrollAttempts++; |
|
|
|
|
|
|
|
if (scrollAttempts > 10) cleanup(); // Stop if stuck for ~5 seconds
|
|
|
|
|
|
|
|
} else { |
|
|
|
|
|
|
|
lastHeight = newHeight; |
|
|
|
|
|
|
|
scrollAttempts = 0; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
}, 500); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Safety timeout: 45 seconds total for scraping
|
|
|
|
|
|
|
|
setTimeout(cleanup, 45000); |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
}, limit); |
|
|
|
|
|
|
|
|
|
|
|
console.log(`[Search Success] Found ${uniqueUrls.length} unique post URLs.`); |
|
|
|
const resultUrls = uniqueUrls.slice(0, limit); |
|
|
|
|
|
|
|
console.log(`[Search Success] Extracted ${resultUrls.length} unique post URLs.`); |
|
|
|
|
|
|
|
|
|
|
|
return { |
|
|
|
return { |
|
|
|
keyword, |
|
|
|
keyword, |
|
|
|
count: uniqueUrls.length, |
|
|
|
count: resultUrls.length, |
|
|
|
urls: uniqueUrls |
|
|
|
urls: resultUrls |
|
|
|
}; |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
} catch (error) { |
|
|
|
} catch (error) { |
|
|
|
|