import puppeteer from 'puppeteer';

/** Property names whose values `nestedLookup` collects by default. */
const DEFAULT_LOOKUP_KEYS = [
  "thread_items",
  "replies",
  "child_posts",
  "threaded_replies",
  "post",
  "posts",
  "edges",
  "node",
  "reply_threads",
];

/** Keys whose array values are only descended into, never collected themselves. */
const LIST_CONTAINER_KEYS = new Set(["edges", "thread_items", "reply_threads"]);

/** Viewport applied to the page; also used to aim the mouse at its centre. */
const VIEWPORT = Object.freeze({ width: 1280, height: 900 });

/** Number of wheel-scroll iterations performed to trigger lazy loading. */
const TOTAL_SCROLLS = 25;

/** Wheel delta (px) applied on every scroll iteration. */
const SCROLL_DISTANCE = 1000;

/** Multipliers for the abbreviated counters shown in the UI (e.g. "1.2K", "3 萬"). */
const COUNT_MULTIPLIERS = new Map([
  ["K", 1e3],
  ["M", 1e6],
  ["B", 1e9],
  ["萬", 1e4],
  ["万", 1e4],
  ["億", 1e8],
  ["亿", 1e8],
]);

/** Resolves after `ms` milliseconds. */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Deep lookup over an arbitrarily nested JSON value.
 *
 * Walks every array element and every object property exactly once and
 * collects the value of each property whose name is listed in `keys`.
 * Array values stored under "edges", "thread_items" or "reply_threads" are
 * treated as pure containers: they are descended into but not collected.
 *
 * Wrapper structures such as `__bbox.require` / `__bbox.define` or a
 * `["RelayPrefetchedStreamCache", "next", null, ["key", {data}]]` call array
 * need no special handling: they are ordinary objects/arrays and are reached
 * by the generic walk. (Handling them separately, as an earlier version did,
 * only made every match appear several times in the result.)
 *
 * @param {*} obj - Parsed JSON value to search; non-objects yield `[]`.
 * @param {string[]} [keys] - Property names to collect.
 * @returns {Array<*>} Collected values in depth-first order; a collected value
 *   is listed before anything found inside it.
 */
function nestedLookup(obj, keys = DEFAULT_LOOKUP_KEYS) {
  const results = [];
  const wanted = new Set(keys);

  const visit = (value) => {
    if (typeof value !== "object" || value === null) return;

    if (Array.isArray(value)) {
      for (const item of value) visit(item);
      return;
    }

    for (const [key, child] of Object.entries(value)) {
      const isListContainer = LIST_CONTAINER_KEYS.has(key) && Array.isArray(child);
      if (wanted.has(key) && child !== null && !isListContainer) {
        results.push(child);
      }
      visit(child);
    }
  };

  visit(obj);
  return results;
}

/** Returns the entry with the largest `width` from a non-empty array. */
function pickWidest(versions) {
  return versions.reduce((max, cur) => (cur.width > max.width ? cur : max), versions[0]);
}

/**
 * Extracts media URLs from a post object.
 *
 * Not part of `parseThread`'s output at the moment (the corresponding fields
 * are disabled there); kept so they can be switched back on without
 * re-deriving the media_type handling.
 *
 * media_type 2 -> widest video plus the first image candidate as thumbnail,
 * media_type 1 -> widest image candidate,
 * media_type 8 -> first candidate / first video version of each carousel entry.
 *
 * @param {object} post - Post object as found in the page's JSON payload.
 * @returns {{images: string[], videos: string[], video_thumbnail: (string|null)}}
 */
function extractMedia(post) {
  const images = [];
  const videos = [];
  let videoThumbnail = null;
  const mediaType = post.media_type;

  if (mediaType === 2 && post.video_versions?.length > 0) {
    const bestVideo = pickWidest(post.video_versions);
    if (bestVideo?.url) videos.push(bestVideo.url);
    if (post.image_versions2?.candidates?.length > 0) {
      videoThumbnail = post.image_versions2.candidates[0].url;
    }
  } else if (mediaType === 1 && post.image_versions2?.candidates?.length > 0) {
    const bestImage = pickWidest(post.image_versions2.candidates);
    if (bestImage?.url) images.push(bestImage.url);
  } else if (mediaType === 8 && post.carousel_media?.length > 0) {
    for (const media of post.carousel_media) {
      if (media.media_type === 1 && media.image_versions2?.candidates?.length > 0) {
        images.push(media.image_versions2.candidates[0].url);
      }
      if (media.media_type === 2 && media.video_versions?.length > 0) {
        videos.push(media.video_versions[0].url);
      }
    }
  }

  return { images, videos, video_thumbnail: videoThumbnail };
}

/**
 * Normalises one raw item (as returned by `nestedLookup`) into a flat record.
 *
 * The post object is looked for, in order, at `data.post`, `data.node.post`,
 * `data.thread_items[0].post`, `data.posts[0]`, `data` itself (when it has a
 * `caption`), and finally inside a GraphQL-style `data.result.data` wrapper.
 *
 * @param {object|null|undefined} data - Candidate container object.
 * @returns {object|null} Flat post record, or `null` when no post carrying an
 *   `id` or `pk` can be located.
 */
function parseThread(data) {
  if (!data) return null;

  let post =
    data.post ||
    data.node?.post ||
    data.thread_items?.[0]?.post ||
    data.posts?.[0] ||
    (data.caption ? data : null);

  // GraphQL responses wrap the payload in result.data.
  if (!post && data.result?.data) {
    const inner = data.result.data;
    post = inner.text_post_app_info || inner.post || inner;
  }

  if (!post || (!post.id && !post.pk)) return null;

  return {
    text: post.caption?.text || post.text_post_app_info?.share_info?.quoted_post?.caption?.text || "",
    published_on: post.taken_at,
    id: post.id || post.pk,
    code: post.code,
    username: post.user?.username,
    like_count: post.like_count || 0,
    reply_count: post.direct_reply_count || post.reply_count || 0,
    // Fields deliberately left out of the record for now; re-enable as needed:
    //   user_pic: post.user?.profile_pic_url,
    //   ...extractMedia(post),
    //   url: post.user?.username && post.code
    //     ? `https://www.threads.net/@${post.user.username}/post/${post.code}`
    //     : null,
    parent_post_id: post.text_post_app_info?.reply_to_author?.id || post.reply_to_post_id || null,
  };
}

/**
 * Scrapes every post/reply currently rendered in the page's DOM.
 *
 * This function is passed to `page.evaluate`, i.e. it is serialised and run
 * inside the browser page. It therefore must stay self-contained: it cannot
 * reference any helper or constant defined in this module.
 *
 * @returns {Promise<Array<{username: string, text: string,
 *   published_on: (string|null), like_count_str: string,
 *   reply_count_str: string, code: (string|null), url: (string|null)}>>}
 *   One entry per container that has a username and either text or a post code.
 */
async function scrapeThreadDataFromDOM() {
  const items = [];
  // Each post/reply is looked up as a div with data-pressable-container="true".
  const containers = document.querySelectorAll('div[data-pressable-container="true"]');

  for (const container of containers) {
    try {
      // Counter text for the action icon carrying the given aria-label.
      // Always a string; "0" when the icon or its counter span is missing, so
      // one absent counter no longer discards the whole item.
      const readCount = (label) => {
        const icon = container.querySelector(`svg[aria-label="${label}"]`);
        const countEl = icon?.closest('div')?.parentElement?.querySelector('span');
        return countEl ? countEl.innerText : "0";
      };

      // 1. Username.
      const userEl = container.querySelector('a[href*="/@"] span[translate="no"]');
      const username = userEl ? userEl.innerText.trim() : null;

      // 2. Post text. NOTE(review): `x1a6qonq` looks like a generated class
      //    name - confirm it is still current if this starts returning "".
      const textEls = container.querySelectorAll('.x1a6qonq span[dir="auto"]');
      const text = Array.from(textEls).map((el) => el.innerText).join('\n').trim();

      // 3. Publication time, taken from the <time datetime="..."> attribute.
      const timeEl = container.querySelector('time');
      const published_on = timeEl ? timeEl.getAttribute('datetime') : null;

      // 4. Interaction counters, located via the icons' aria-labels.
      const like_count_str = readCount("讚");
      const reply_count_str = readCount("回覆");

      // 5. Post code: the path segment right after "/post/", without any
      //    trailing path, query string or fragment.
      const linkEl = container.querySelector('a[href*="/post/"]');
      const postUrl = linkEl ? linkEl.href : null;
      const code = postUrl ? postUrl.split('/post/')[1]?.split(/[\/?#]/)[0] : null;

      if (username && (text || code)) {
        items.push({
          username,
          text,
          published_on,
          like_count_str,
          reply_count_str,
          code,
          url: postUrl,
        });
      }
    } catch (err) {
      console.error("DOM Parsing error item:", err);
    }
  }

  return items;
}

/**
 * Extracts the post code (the segment after "/post/") from a Threads URL,
 * ignoring any trailing path, query string or fragment.
 *
 * @param {string} url
 * @returns {string|undefined} The code, or `undefined` when the URL has no
 *   "/post/" segment.
 */
function extractPostCode(url) {
  return url.split("/post/")[1]?.split(/[\/?#]/)[0];
}

/**
 * Converts a counter as displayed in the UI ("12", "1,234", "1.2K", "3 萬")
 * into a number.
 *
 * @param {string|number|null|undefined} raw
 * @returns {number} Parsed count, or 0 when `raw` does not start with a number.
 */
function parseCount(raw) {
  if (typeof raw === "number") return Number.isFinite(raw) ? raw : 0;
  if (typeof raw !== "string") return 0;

  const match = raw
    .replace(/,/g, "")
    .trim()
    .match(/^(\d+(?:\.\d+)?)\s*([KMB萬万億亿])?/i);
  if (!match) return 0;

  const [, digits, suffix] = match;
  const multiplier = suffix ? COUNT_MULTIPLIERS.get(suffix.toUpperCase()) ?? 1 : 1;
  // Rounding removes floating-point residue such as 1.1 * 1000.
  return Math.round(Number(digits) * multiplier);
}

/**
 * Adds numeric `like_count` / `reply_count` and a Unix-seconds `published_at`
 * to a raw DOM item; every original field (including the `*_str` ones) is kept.
 */
function toThreadItem(item) {
  const timestampMs = item.published_on ? new Date(item.published_on).getTime() : NaN;
  return {
    ...item,
    like_count: parseCount(item.like_count_str),
    reply_count: parseCount(item.reply_count_str),
    published_at: Number.isNaN(timestampMs) ? null : timestampMs / 1000,
  };
}

/**
 * Scrolls the page with simulated mouse-wheel events so that lazily loaded
 * replies get rendered, logging how the page height evolves.
 */
async function scrollToLoadReplies(page) {
  // Park the mouse at the viewport centre so the wheel events hit the page.
  await page.mouse.move(VIEWPORT.width / 2, VIEWPORT.height / 2);

  let lastHeight = await page.evaluate(() => document.body.scrollHeight);
  console.log(` - 初始頁面高度: ${lastHeight}px`);

  for (let i = 0; i < TOTAL_SCROLLS; i++) {
    await page.mouse.wheel({ deltaY: SCROLL_DISTANCE });
    await sleep(1500); // give the follow-up requests time to respond

    // Every 5th iteration, scroll back up and down again.
    if (i % 5 === 0 && i !== 0) {
      await page.mouse.wheel({ deltaY: -600 });
      await sleep(500);
      await page.mouse.wheel({ deltaY: 800 });
    }

    const currentHeight = await page.evaluate(() => document.body.scrollHeight);
    if (currentHeight > lastHeight) {
      console.log(` - 第 ${i + 1} 次滾動: 偵測到內容加載!高度增加 ${currentHeight - lastHeight}px (目前: ${currentHeight}px)`);
      lastHeight = currentHeight;
    } else {
      console.log(` - 第 ${i + 1} 次滾動: 高度未變化 (${currentHeight}px)`);
    }
  }
}

/**
 * Opens a Threads post in headless Chromium, scrolls to load its replies and
 * scrapes the rendered DOM.
 *
 * @param {string} postUrl - URL of the post; must contain "threads.".
 * @param {Array<object>} [cookies] - Session cookies handed to
 *   `browser.setCookie`. Injection failures are logged and scraping continues
 *   unauthenticated.
 * @returns {Promise<{thread: (object|undefined), replies: object[]}>}
 *   `thread` is the item whose code matches the URL, otherwise the first
 *   scraped item, otherwise `undefined` when nothing was scraped. `replies`
 *   holds every other item whose code differs from the URL's code.
 * @throws {Error} When `postUrl` is not a Threads URL, or when navigation fails.
 */
export async function getThread(postUrl, cookies) {
  if (typeof postUrl !== "string" || !postUrl.includes("threads.")) {
    throw new Error("無效的 Threads 網址");
  }

  const postCodeFromUrl = extractPostCode(postUrl);
  console.log(`[目標鎖定] 貼文代碼: ${postCodeFromUrl}`);

  const browser = await puppeteer.launch({
    headless: true,
    args: [
      '--disable-blink-features=AutomationControlled',
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--window-size=1920,1080',
    ],
    slowMo: 50,
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');

    if (cookies?.length > 0) {
      console.log(`[Auth] 正在注入 ${cookies.length} 個 Session Cookies...`);
      try {
        // Each cookie needs at least name, value and either domain or url.
        await browser.setCookie(...cookies);
        console.log("[Auth] Cookies 注入成功");
      } catch (cookieError) {
        // Best effort: keep going without authentication.
        console.error("[Auth] Cookies 格式錯誤或注入失敗:", cookieError.message);
      }
    }

    await page.setViewport({ ...VIEWPORT });

    console.log("[2/5] 頁面跳轉中...");
    await page.goto(postUrl, { waitUntil: 'domcontentloaded', timeout: 10000 });

    try {
      await page.waitForSelector('main', { timeout: 15000 });
      console.log("[3/5] 主容器已載入");
    } catch (e) {
      // Deliberately non-fatal: the DOM scrape below may still find content.
      console.log("[警告] 等待 main 超時,繼續執行...");
    }

    console.log("[4/5] 開始執行滾動腳本...");
    console.log("[4/5] 執行模擬滑鼠滾動以加載大量留言...");
    await scrollToLoadReplies(page);

    console.log("[5/5] 等待數據流穩定 (10s)...");
    await sleep(10000);

    const rawItems = await page.evaluate(scrapeThreadDataFromDOM);

    // Post-process: numeric counters, Unix timestamp, main post vs. replies.
    const processedItems = rawItems.map(toThreadItem);

    let mainThread = processedItems.find((t) => t.code === postCodeFromUrl);
    if (!mainThread) {
      console.log("[警告] DOM 中找不到主貼文,嘗試抓取第一個項目作為主文");
      mainThread = processedItems[0];
    }
    const replies = processedItems.filter(
      (t) => t !== mainThread && t.code !== postCodeFromUrl,
    );

    console.log(`[成功] 作者: ${mainThread?.username || '未知'},回覆數: ${replies.length}`);

    // Disabled alternative strategy: read the embedded JSON payloads instead of
    // the rendered DOM (uses nestedLookup + parseThread above).
    //
    // const allScripts = await page.$$eval(
    //   'script[type="application/json"]',
    //   (scripts) => scripts.map((s) => s.textContent)
    // );
    // console.log(`[解析] 成功掃描到 ${allScripts.length} 個 JSON 區塊`);
    // const allParsedItems = new Map();
    // allScripts.forEach((content) => {
    //   if (!content || !content.includes("ScheduledServerJS")) return;
    //   try {
    //     const data = JSON.parse(content);
    //     nestedLookup(data).flat().forEach((item) => {
    //       if (!item) return;
    //       const parsed = parseThread(item);
    //       if (parsed && parsed.id) allParsedItems.set(parsed.id, parsed);
    //     });
    //   } catch (e) {}
    // });
    // const itemsArray = Array.from(allParsedItems.values());
    // const mainThread = itemsArray.find((t) => t.code === postCodeFromUrl);
    // if (!mainThread) {
    //   throw new Error("無法定位主貼文。這通常是因為 Auth 阻擋或頁面未完全渲染。");
    // }
    // const authorName = mainThread.username;
    // const replies = itemsArray
    //   .filter((t) => t.code !== postCodeFromUrl && mainThread.id.includes(t.parent_post_id))
    //   .sort((a, b) => a.published_on - b.published_on);
    // console.log(`[解析成功] 找到主貼文,作者: ${authorName},回覆數量: ${replies.length}`);

    return { thread: mainThread, replies };
  } finally {
    await browser.close();
  }
}