main
reng 2 weeks ago
parent c3501c1028
commit 03a16df356
  1. 3
      v2/app/src/components/graph.jsx
  2. 6
      v2/scrapper/main.js
  3. 160
      v2/scrapper/search.js

@ -78,6 +78,9 @@ export default function Graph({results}){
if(!points || points.length===0) return;
sendOsc('/clear', JSON.stringify({}));
// return;
const keywords=results.filter((point)=>point?.type==='keyword');

@ -11,7 +11,7 @@ dotenv.config();
const cookies = [
{
name: 'sessionid',
value: '64605724719%3ALlZCmwghVyOAck%3A23%3AAYhptDoKttkGRWkpa5583neohBfLXlGfOlwPPmdP1w',
value: '64605724719%3AP70RqmwttERST3%3A1%3AAYhGxKgIRQ7XUEx9LXhEzvt1l21d4SzEAmGYeE-D0Q',
domain: '.threads.com',
path:'/',
httpOnly:true,
@ -29,7 +29,7 @@ const cookies = [
},
{
name:'csrftoken',
value:'SI5YedKIeuSAgAbdtfynUwzrmSAGquxH',
value:'isG68x9tZt73Uuc6nv1m63f4MveFL7Uy',
domain: '.threads.com',
path:'/',
httpOnly:true,
@ -54,7 +54,7 @@ const Keywords=[
// --- Scraper configuration (top-level constants in v2/scrapper/main.js) ---
// NOTE(review): this span comes from a rendered diff with no +/- markers; the
// two SCRAP_TYPE lines below are the removed/added pair of a single change
// (TAG -> KEYWORD). Taken as literal JS, redeclaring a `const` is a
// SyntaxError — only one of the two lines exists in the actual file.
// NOTE(review): the sessionid/csrftoken cookie values earlier in this file are
// live session credentials committed to the repo — and this diff leaks BOTH
// the old and new values. They should be rotated and loaded from environment
// variables instead (dotenv.config() is already called in this file).
const Version="v6";
const DEBUG_MODE=false;
const SCRAP_TYPE='TAG'; // 'KEYWORD' or 'TAG'
const SCRAP_TYPE='KEYWORD'; // 'KEYWORD' or 'TAG'
const CLEAR=true;
const COLLECTION_NAME='data-v4';

@ -1,27 +1,24 @@
import puppeteer from 'puppeteer';
/**
* Threads Search Scraper
* Navigates to the search page, enters a keyword, and extracts post URLs.
* @param {string} keyword - The search term.
* @param {number} limit - Maximum number of URLs to return.
* @param {Array} cookies - Optional: Array of Puppeteer-formatted cookies to handle login.
* Optimized Threads Search Scraper
* Uses a MutationObserver to catch posts in a virtualized list and improved scrolling.
*/
export async function searchThreads(keyword, limit = 20, cookies = [], searchMode='KEYWORD') {
export async function searchThreads(keyword, limit = 20, cookies = [], searchMode = 'KEYWORD') {
if (!keyword) {
throw new Error("Please provide a keyword for the search.");
}
const searchUrl = `https://www.threads.net/search?`
+`q=${encodeURIComponent(keyword)}`
+`&search_type=TOP`
+`&limit=${limit}`
+`&search_mode=${searchMode}`
+`&media_type=TEXT`;
console.log(`[Search Start] Keyword: "${keyword}" | URL: ${searchUrl}`);
+ `q=${encodeURIComponent(keyword)}`
+ `&search_type=TOP`
+ `&search_mode=${searchMode}`;
console.log(`[Search Start] Keyword: "${keyword}" | Target: ${limit} posts`);
const browser = await puppeteer.launch({
headless: "new",
headless: true,
// slowMo: 50,
args: [
'--disable-blink-features=AutomationControlled',
'--no-sandbox',
@ -32,89 +29,104 @@ export async function searchThreads(keyword, limit = 20, cookies = [], searchMod
try {
const page = await browser.newPage();
// Anti-detection headers
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36');
await page.setExtraHTTPHeaders({
'Accept-Language': 'en-US,en;q=0.9',
});
if (cookies && cookies.length > 0) {
console.log(`[Auth] 正在注入 ${cookies.length} 個 Session Cookies...`);
try {
// 確保每個 cookie 對象至少包含 name, value 以及 domain 或 url
await browser.setCookie(...cookies);
console.log("[Auth] Cookies 注入成功");
// console.log(await browser.cookies());
} catch (cookieError) {
console.error("[Auth] Cookies 格式錯誤或注入失敗:", cookieError.message);
}
console.log(`[Auth] Injecting ${cookies.length} cookies...`);
await page.setCookie(...cookies);
}
await page.setViewport({ width: 1280, height: 900 });
await page.setViewport({ width: 1280, height: 1000 });
// Navigate to search results
const response = await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 60000 });
await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 60000 });
// Check if we were redirected to the login page
const currentUrl = page.url();
if (currentUrl.includes('/login')) {
console.error("[Auth Error] Threads redirected to login. You must provide valid session cookies.");
throw new Error("Authentication required: Threads search is restricted to logged-in users.");
if (page.url().includes('/login')) {
throw new Error("Authentication required: Redirected to login.");
}
// Wait for the results to start appearing
// Wait for initial load
try {
// Threads search result container or post links
await page.waitForSelector('a[href*="/post/"]', { timeout: 20000 });
await page.waitForSelector('a[href*="/post/"]', { timeout: 15000 });
} catch (e) {
console.warn("[Warning] Results did not load. This might be a login wall or no results found.");
// Take a screenshot for debugging if needed (local environment only)
// await page.screenshot({ path: 'search_debug.png' });
console.warn("[Warning] Initial posts not found.");
}
// Scroll logic to gather more results
await page.evaluate(async (maxItems) => {
await new Promise((resolve) => {
let totalHeight = 0;
let distance = 500;
let timer = setInterval(() => {
let scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
const currentLinks = document.querySelectorAll('a[href*="/post/"]').length;
// Stop if we hit the limit, bottom of page, or a safety cap
if (totalHeight >= scrollHeight || currentLinks >= maxItems || totalHeight > 15000) {
clearInterval(timer);
resolve();
}
}, 300);
});
}, limit);
/**
* The Scrape Logic:
* 1. Uses MutationObserver to watch for NEW links added to the DOM.
* 2. Periodically scrolls and waits for the network to fetch more.
* 3. Collects unique URLs into an Internal Set.
*/
const uniqueUrls = await page.evaluate(async (maxItems) => {
const discovered = new Set();
// Extracting the URLs
const postUrls = await page.evaluate(() => {
return new Promise((resolve) => {
// Function to extract valid post links from current DOM
const grabLinks = () => {
const links = Array.from(document.querySelectorAll('a[href*="/post/"]'));
return links
.map(a => a.href)
// Filter for actual post links and ignore UI elements like repost/share buttons
.filter(href => {
const isPost = href.includes('/post/');
const isNotAction = !href.includes('/reposts') && !href.includes('/replies');
return isPost && isNotAction;
links.forEach(a => {
const href = a.href;
// Filter out noise like /reposts or /replies sub-pages
if (!href.includes('/reposts') && !href.includes('/replies')) {
// Threads links often have queries, clean them for de-duplication
const cleanUrl = href.split('?')[0];
discovered.add(cleanUrl);
}
});
};
// Setup observer to catch posts as they are rendered during scroll
const observer = new MutationObserver(() => {
grabLinks();
if (discovered.size >= maxItems) {
cleanup();
}
});
// Deduplicate using a Set
const uniqueUrls = [...new Set(postUrls)].slice(0, limit);
observer.observe(document.body, { childList: true, subtree: true });
const cleanup = () => {
observer.disconnect();
clearInterval(scrollInterval);
resolve(Array.from(discovered));
};
// Perform incremental scrolling
let lastHeight = document.body.scrollHeight;
let scrollAttempts = 0;
const scrollInterval = setInterval(() => {
window.scrollBy(0, 800);
grabLinks(); // Manual grab just in case
if (discovered.size >= maxItems) {
cleanup();
return;
}
// Check if we've hit the bottom and no new content is loading
let newHeight = document.body.scrollHeight;
if (newHeight === lastHeight) {
scrollAttempts++;
if (scrollAttempts > 10) cleanup(); // Stop if stuck for ~5 seconds
} else {
lastHeight = newHeight;
scrollAttempts = 0;
}
}, 500);
// Safety timeout: 45 seconds total for scraping
setTimeout(cleanup, 45000);
});
}, limit);
console.log(`[Search Success] Found ${uniqueUrls.length} unique post URLs.`);
const resultUrls = uniqueUrls.slice(0, limit);
console.log(`[Search Success] Extracted ${resultUrls.length} unique post URLs.`);
return {
keyword,
count: uniqueUrls.length,
urls: uniqueUrls
count: resultUrls.length,
urls: resultUrls
};
} catch (error) {

Loading…
Cancel
Save