diff --git a/v2/app/src/components/graph.jsx b/v2/app/src/components/graph.jsx index f02d9f2..54bd6a7 100644 --- a/v2/app/src/components/graph.jsx +++ b/v2/app/src/components/graph.jsx @@ -78,6 +78,9 @@ export default function Graph({results}){ if(!points || points.length===0) return; + sendOsc('/clear', JSON.stringify({})); + // return; + const keywords=results.filter((point)=>point?.type==='keyword'); diff --git a/v2/scrapper/main.js b/v2/scrapper/main.js index 29ee779..fc91564 100644 --- a/v2/scrapper/main.js +++ b/v2/scrapper/main.js @@ -11,7 +11,7 @@ dotenv.config(); const cookies = [ { name: 'sessionid', - value: '64605724719%3ALlZCmwghVyOAck%3A23%3AAYhptDoKttkGRWkpa5583neohBfLXlGfOlwPPmdP1w', + value: '64605724719%3AP70RqmwttERST3%3A1%3AAYhGxKgIRQ7XUEx9LXhEzvt1l21d4SzEAmGYeE-D0Q', domain: '.threads.com', path:'/', httpOnly:true, @@ -29,7 +29,7 @@ const cookies = [ }, { name:'csrftoken', - value:'SI5YedKIeuSAgAbdtfynUwzrmSAGquxH', + value:'isG68x9tZt73Uuc6nv1m63f4MveFL7Uy', domain: '.threads.com', path:'/', httpOnly:true, @@ -54,7 +54,7 @@ const Keywords=[ const Version="v6"; const DEBUG_MODE=false; -const SCRAP_TYPE='TAG'; // 'KEYWORD' or 'TAG' +const SCRAP_TYPE='KEYWORD'; // 'KEYWORD' or 'TAG' const CLEAR=true; const COLLECTION_NAME='data-v4'; diff --git a/v2/scrapper/search.js b/v2/scrapper/search.js index 9c0e5d2..d711814 100644 --- a/v2/scrapper/search.js +++ b/v2/scrapper/search.js @@ -1,27 +1,24 @@ import puppeteer from 'puppeteer'; /** - * Threads Search Scraper - * Navigates to the search page, enters a keyword, and extracts post URLs. - * * @param {string} keyword - The search term. - * @param {number} limit - Maximum number of URLs to return. - * @param {Array} cookies - Optional: Array of Puppeteer-formatted cookies to handle login. + * Optimized Threads Search Scraper + * Uses a MutationObserver to catch posts in a virtualized list and improved scrolling. */ -export async function searchThreads(keyword, limit = 20, cookies = [], searchMode='KEYWORD') { +export async function searchThreads(keyword, limit = 20, cookies = [], searchMode = 'KEYWORD') { if (!keyword) { throw new Error("Please provide a keyword for the search."); } const searchUrl = `https://www.threads.net/search?` - +`q=${encodeURIComponent(keyword)}` - +`&search_type=TOP` - +`&limit=${limit}` - +`&search_mode=${searchMode}` - +`&media_type=TEXT`; - console.log(`[Search Start] Keyword: "${keyword}" | URL: ${searchUrl}`); + + `q=${encodeURIComponent(keyword)}` + + `&search_type=TOP` + + `&search_mode=${searchMode}`; + + console.log(`[Search Start] Keyword: "${keyword}" | Target: ${limit} posts`); const browser = await puppeteer.launch({ - headless: "new", + headless: true, + // slowMo: 50, args: [ '--disable-blink-features=AutomationControlled', '--no-sandbox', @@ -32,89 +29,104 @@ export async function searchThreads(keyword, limit = 20, cookies = [], searchMod try { const page = await browser.newPage(); - // Anti-detection headers await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'); - await page.setExtraHTTPHeaders({ - 'Accept-Language': 'en-US,en;q=0.9', - }); - + if (cookies && cookies.length > 0) { - console.log(`[Auth] 正在注入 ${cookies.length} 個 Session Cookies...`); - try { - // 確保每個 cookie 對象至少包含 name, value 以及 domain 或 url - await browser.setCookie(...cookies); - console.log("[Auth] Cookies 注入成功"); - // console.log(await browser.cookies()); - - } catch (cookieError) { - console.error("[Auth] Cookies 格式錯誤或注入失敗:", cookieError.message); - } + console.log(`[Auth] Injecting ${cookies.length} cookies...`); + await page.setCookie(...cookies); } - await page.setViewport({ width: 1280, height: 900 }); + await page.setViewport({ width: 1280, height: 1000 }); - // Navigate to search results - const response = await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 60000 }); + await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 60000 }); - // Check if we were redirected to the login page - const currentUrl = page.url(); - if (currentUrl.includes('/login')) { - console.error("[Auth Error] Threads redirected to login. You must provide valid session cookies."); - throw new Error("Authentication required: Threads search is restricted to logged-in users."); + if (page.url().includes('/login')) { + throw new Error("Authentication required: Redirected to login."); } - // Wait for the results to start appearing + // Wait for initial load try { - // Threads search result container or post links - await page.waitForSelector('a[href*="/post/"]', { timeout: 20000 }); + await page.waitForSelector('a[href*="/post/"]', { timeout: 15000 }); } catch (e) { - console.warn("[Warning] Results did not load. This might be a login wall or no results found."); - // Take a screenshot for debugging if needed (local environment only) - // await page.screenshot({ path: 'search_debug.png' }); + console.warn("[Warning] Initial posts not found."); } - // Scroll logic to gather more results - await page.evaluate(async (maxItems) => { - await new Promise((resolve) => { - let totalHeight = 0; - let distance = 500; - let timer = setInterval(() => { - let scrollHeight = document.body.scrollHeight; - window.scrollBy(0, distance); - totalHeight += distance; - - const currentLinks = document.querySelectorAll('a[href*="/post/"]').length; - // Stop if we hit the limit, bottom of page, or a safety cap - if (totalHeight >= scrollHeight || currentLinks >= maxItems || totalHeight > 15000) { - clearInterval(timer); - resolve(); - } - }, 300); - }); - }, limit); + /** + * The Scrape Logic: + * 1. Uses MutationObserver to watch for NEW links added to the DOM. + * 2. Periodically scrolls and waits for the network to fetch more. + * 3. Collects unique URLs into an Internal Set. + */ + const uniqueUrls = await page.evaluate(async (maxItems) => { + const discovered = new Set(); + + return new Promise((resolve) => { + // Function to extract valid post links from current DOM + const grabLinks = () => { + const links = Array.from(document.querySelectorAll('a[href*="/post/"]')); + links.forEach(a => { + const href = a.href; + // Filter out noise like /reposts or /replies sub-pages + if (!href.includes('/reposts') && !href.includes('/replies')) { + // Threads links often have queries, clean them for de-duplication + const cleanUrl = href.split('?')[0]; + discovered.add(cleanUrl); + } + }); + }; - // Extracting the URLs - const postUrls = await page.evaluate(() => { - const links = Array.from(document.querySelectorAll('a[href*="/post/"]')); - return links - .map(a => a.href) - // Filter for actual post links and ignore UI elements like repost/share buttons - .filter(href => { - const isPost = href.includes('/post/'); - const isNotAction = !href.includes('/reposts') && !href.includes('/replies'); - return isPost && isNotAction; + // Setup observer to catch posts as they are rendered during scroll + const observer = new MutationObserver(() => { + grabLinks(); + if (discovered.size >= maxItems) { + cleanup(); + } }); - }); - // Deduplicate using a Set - const uniqueUrls = [...new Set(postUrls)].slice(0, limit); + observer.observe(document.body, { childList: true, subtree: true }); + + const cleanup = () => { + observer.disconnect(); + clearInterval(scrollInterval); + resolve(Array.from(discovered)); + }; + + // Perform incremental scrolling + let lastHeight = document.body.scrollHeight; + let scrollAttempts = 0; + + const scrollInterval = setInterval(() => { + window.scrollBy(0, 800); + grabLinks(); // Manual grab just in case + + if (discovered.size >= maxItems) { + cleanup(); + return; + } + + // Check if we've hit the bottom and no new content is loading + let newHeight = document.body.scrollHeight; + if (newHeight === lastHeight) { + scrollAttempts++; + if (scrollAttempts > 10) cleanup(); // Stop if stuck for ~5 seconds + } else { + lastHeight = newHeight; + scrollAttempts = 0; + } + }, 500); + + // Safety timeout: 45 seconds total for scraping + setTimeout(cleanup, 45000); + }); + }, limit); - console.log(`[Search Success] Found ${uniqueUrls.length} unique post URLs.`); + const resultUrls = uniqueUrls.slice(0, limit); + console.log(`[Search Success] Extracted ${resultUrls.length} unique post URLs.`); return { keyword, - count: uniqueUrls.length, - urls: uniqueUrls + count: resultUrls.length, + urls: resultUrls }; } catch (error) {