You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
323 lines
13 KiB
323 lines
13 KiB
import puppeteer from 'puppeteer';
|
|
|
|
/**
 * Recursively collects every value stored under one of `keys` anywhere inside
 * a parsed JSON payload.
 *
 * The walk descends into every array element and every object property, so
 * wrappers such as `__bbox.require` / `__bbox.define` and call tuples shaped
 * like ["RelayPrefetchedStreamCache", "next", null, ["key", {data}]] are
 * reached by the generic traversal and need no special handling. Each location
 * is visited exactly once, so a match is reported once per place it occurs and
 * the cost stays linear in the size of the payload.
 *
 * Arrays found under "edges", "thread_items" or "reply_threads" are treated as
 * plain containers: the array itself is not reported, only matches inside it.
 *
 * @param {*} obj - Root value to search; anything that is not a non-null
 *   object yields an empty array.
 * @param {string[]} [keys] - Property names whose values should be collected.
 * @returns {Array<*>} Matched values in depth-first (parent before children)
 *   order. `null` values are skipped; any other value, including primitives,
 *   is reported as-is.
 */
function nestedLookup(obj, keys = ["thread_items", "replies", "child_posts", "threaded_replies", "post", "posts", "edges", "node", "reply_threads"]) {
  // Keys whose array values are only containers and must not be reported themselves.
  const listKeys = ["edges", "thread_items", "reply_threads"];
  const wanted = new Set(keys);
  const results = [];

  const visit = (value) => {
    if (typeof value !== "object" || value === null) return;

    if (Array.isArray(value)) {
      for (const item of value) visit(item);
      return;
    }

    for (const key of Object.keys(value)) {
      const child = value[key];
      if (child === null) continue;

      const isListContainer = listKeys.includes(key) && Array.isArray(child);
      if (wanted.has(key) && !isListContainer) results.push(child);

      // Keep descending regardless of whether this key matched.
      visit(child);
    }
  };

  visit(obj);
  return results;
}
|
|
|
|
/**
 * Normalises one candidate payload into a flat post record.
 *
 * The post object is looked up under several known container shapes, in this
 * order: `data.post`, `data.node.post`, `data.thread_items[0].post`,
 * `data.posts[0]`, `data` itself when it carries a `caption`, and finally the
 * GraphQL-style `data.result.data` (its `text_post_app_info`, its `post`, or
 * the object itself).
 *
 * Media URLs (images / videos) are not part of the returned record.
 *
 * @param {object|null|undefined} data - Candidate payload.
 * @returns {{text: string, published_on: *, id: *, code: *, username: *,
 *   like_count: number, reply_count: number, parent_post_id: *}|null}
 *   The normalised record, or `null` when no post with an `id` or `pk` is found.
 */
function parseThread(data) {
  if (!data) return null;

  let post =
    data.post ||
    data.node?.post ||
    data.thread_items?.[0]?.post ||
    data.posts?.[0] ||
    (data.caption ? data : null);

  // GraphQL responses wrap the payload in result.data.
  if (!post && data.result?.data) {
    const payload = data.result.data;
    post = payload.text_post_app_info || payload.post || payload;
  }

  const id = post?.id || post?.pk;
  if (!id) return null;

  return {
    // Falls back to the quoted post's caption when the post has no caption of its own.
    text: post.caption?.text || post.text_post_app_info?.share_info?.quoted_post?.caption?.text || "",
    published_on: post.taken_at,
    id,
    code: post.code,
    username: post.user?.username,
    like_count: post.like_count || 0,
    reply_count: post.direct_reply_count || post.reply_count || 0,
    // NOTE(review): the first source is `reply_to_author.id`, which by its name
    // looks like an author id rather than a post id — confirm this is intended.
    parent_post_id: post.text_post_app_info?.reply_to_author?.id || post.reply_to_post_id || null,
  };
}
|
|
/**
 * Collects post / reply data from the rendered page DOM.
 *
 * This function is handed to `page.evaluate`, so it executes in the browser
 * page and must stay self-contained: it may only use names it declares itself
 * plus browser globals such as `document`.
 *
 * Every `div[data-pressable-container="true"]` is treated as one post or reply.
 * A container is reported only when a username was found together with either
 * some text or a post code. A container whose parsing throws is logged and
 * skipped without affecting the others.
 *
 * @returns {Promise<Array<{username: string, text: string,
 *   published_on: string|null, like_count_str: string,
 *   reply_count_str: string, code: string|null, url: string|null}>>}
 *   One entry per recognised container. The counts are the raw displayed
 *   strings ("0" when no counter is found).
 */
async function scrapeThreadDataFromDOM() {
  // aria-labels of the action icons. The Chinese labels are the ones originally
  // targeted; the English ones are presumably what an English-locale page
  // uses — TODO confirm against a live page.
  const likeLabels = ["讚", "Like"];
  const replyLabels = ["回覆", "Reply"];

  const items = [];
  const containers = document.querySelectorAll('div[data-pressable-container="true"]');

  containers.forEach((container) => {
    try {
      // Reads the counter shown next to the first icon matching one of `labels`.
      // Always returns a string so callers see one consistent type.
      const readCount = (labels) => {
        for (const label of labels) {
          const svg = container.querySelector(`svg[aria-label="${label}"]`);
          if (!svg) continue;
          // The counter is expected in a span near the icon's wrapping div; a
          // missing wrapper must not discard the whole item.
          const countEl = svg.closest('div')?.parentElement?.querySelector('span');
          return countEl ? countEl.innerText : "0";
        }
        return "0";
      };

      // 1. Username.
      const userEl = container.querySelector('a[href*="/@"] span[translate="no"]');
      const username = userEl ? userEl.innerText.trim() : null;

      // 2. Post text (looked up under the x1a6qonq class).
      const textEls = container.querySelectorAll('.x1a6qonq span[dir="auto"]');
      const text = Array.from(textEls).map((el) => el.innerText).join('\n').trim();

      // 3. Publication time, taken from the <time> element's datetime attribute.
      const timeEl = container.querySelector('time');
      const published_on = timeEl ? timeEl.getAttribute('datetime') : null;

      // 4. Interaction counters.
      const like_count_str = readCount(likeLabels);
      const reply_count_str = readCount(replyLabels);

      // 5. Post URL and the code segment that follows "/post/".
      const linkEl = container.querySelector('a[href*="/post/"]');
      const postUrl = linkEl ? linkEl.href : null;
      const code = postUrl ? postUrl.split('/post/')[1]?.split('/')[0] : null;

      if (username && (text || code)) {
        items.push({
          username,
          text,
          published_on,
          like_count_str,
          reply_count_str,
          code,
          url: postUrl,
        });
      }
    } catch (err) {
      console.error("DOM Parsing error item:", err);
    }
  });

  return items;
}
|
|
/**
 * Converts a displayed counter such as "1,234", "1.2K" or "3.4萬" to a number.
 *
 * NOTE(review): the supported suffixes (K, M, 萬/万, 億/亿) are an assumption
 * about how the page abbreviates counts — confirm against real pages.
 *
 * @param {string|number|null|undefined} raw - Counter text as scraped.
 * @returns {number} Parsed count, or 0 when the text has no leading number.
 */
function parseCountText(raw) {
  if (typeof raw === "number") return Number.isFinite(raw) ? raw : 0;

  const cleaned = String(raw ?? "").replace(/[,\s]/g, "");
  const match = cleaned.match(/^(\d+(?:\.\d+)?)([KkMm萬万億亿]?)/);
  if (!match) return 0;

  const multipliers = { k: 1e3, m: 1e6, "萬": 1e4, "万": 1e4, "億": 1e8, "亿": 1e8 };
  const factor = multipliers[match[2].toLowerCase()] ?? 1;
  return Math.round(Number(match[1]) * factor);
}

/**
 * Opens a Threads post in headless Chromium, scrolls to load replies and
 * scrapes the main post and its replies from the rendered DOM.
 *
 * Data is read from the DOM via `scrapeThreadDataFromDOM`; the JSON-based
 * helpers in this file (`nestedLookup`, `parseThread`) are not used here.
 *
 * @param {string} postUrl - URL of the post; must contain "threads.".
 * @param {Array<object>} [cookies] - Optional session cookies to inject before
 *   navigation. A failed injection is logged and the scrape continues
 *   unauthenticated.
 * @returns {Promise<{thread: object|undefined, replies: object[]}>}
 *   `thread` is the item whose code matches the URL, or the first scraped item
 *   when none matches (`undefined` when nothing was scraped). Each item carries
 *   the scraped fields plus numeric `like_count` / `reply_count` and
 *   `published_at` (epoch seconds, or `null`).
 * @throws {Error} When `postUrl` is missing or is not a Threads URL. Errors
 *   from navigation or scraping propagate after the browser has been closed.
 */
export async function getThread(postUrl, cookies) {
  if (!postUrl?.includes("threads.")) throw new Error("無效的 Threads 網址");

  const postCodeFromUrl = postUrl.split("/post/")[1]?.split("/")[0];
  console.log(`[目標鎖定] 貼文代碼: ${postCodeFromUrl}`);

  const browser = await puppeteer.launch({
    headless: true,
    args: [
      '--disable-blink-features=AutomationControlled',
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--window-size=1920,1080'
    ],
    slowMo: 50
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');

    if (cookies && cookies.length > 0) {
      console.log(`[Auth] 正在注入 ${cookies.length} 個 Session Cookies...`);
      try {
        // Each cookie object is expected to carry at least name, value and a
        // domain or url.
        await browser.setCookie(...cookies);
        console.log("[Auth] Cookies 注入成功");
      } catch (cookieError) {
        // Best effort: continue without authentication.
        console.error("[Auth] Cookies 格式錯誤或注入失敗:", cookieError.message);
      }
    }

    await page.setViewport({ width: 1280, height: 900 });

    console.log("[2/5] 頁面跳轉中...");
    await page.goto(postUrl, {
      waitUntil: 'domcontentloaded',
      timeout: 10000
    });

    try {
      await page.waitForSelector('main', { timeout: 15000 });
      console.log("[3/5] 主容器已載入");
    } catch (e) {
      // Not fatal: the page may still be scrapable without <main>.
      console.log("[警告] 等待 main 超時,繼續執行...");
    }

    console.log("[4/5] 開始執行滾動腳本...");
    console.log("[4/5] 執行模擬滑鼠滾動以加載大量留言...");

    // Park the pointer inside the viewport so wheel events reach the page.
    await page.mouse.move(960, 540);

    const totalScrolls = 30;
    const scrollDistance = 1000;
    const maxStalledScrolls = 5;

    let lastHeight = await page.evaluate(() => document.body.scrollHeight);
    console.log(` - 初始頁面高度: ${lastHeight}px`);

    // Consecutive scrolls that did not grow the page. This must live outside
    // the loop, otherwise it is reset every iteration and the early exit below
    // can never trigger.
    let stalledScrolls = 0;

    for (let i = 0; i < totalScrolls; i++) {
      await page.mouse.wheel({ deltaY: scrollDistance });
      // Give the page time to fetch and render more replies.
      await new Promise(r => setTimeout(r, 1500));

      // Every fifth scroll, jiggle up and back down.
      if (i % 5 === 0 && i !== 0) {
        await page.mouse.wheel({ deltaY: -600 });
        await new Promise(r => setTimeout(r, 500));
        await page.mouse.wheel({ deltaY: 800 });
      }

      const currentHeight = await page.evaluate(() => document.body.scrollHeight);
      if (currentHeight > lastHeight) {
        console.log(` - 第 ${i + 1} 次滾動: 偵測到內容加載!高度增加 ${currentHeight - lastHeight}px (目前: ${currentHeight}px)`);
        lastHeight = currentHeight;
        stalledScrolls = 0;
      } else {
        console.log(` - 第 ${i + 1} 次滾動: 高度未變化 (${currentHeight}px)`);
        stalledScrolls++;
        if (stalledScrolls >= maxStalledScrolls) {
          console.log(" - 偵測到多次無高度變化,提前結束滾動。");
          break;
        }
      }
    }

    console.log("[5/5] 等待數據流穩定 (10s)...");
    await new Promise(r => setTimeout(r, 10000));

    const rawItems = await page.evaluate(scrapeThreadDataFromDOM);

    // Post-process: numeric counters and an epoch-seconds timestamp. The raw
    // `*_str` fields stay available through the spread.
    const processedItems = rawItems.map(item => ({
      ...item,
      like_count: parseCountText(item.like_count_str),
      reply_count: parseCountText(item.reply_count_str),
      published_at: item.published_on ? new Date(item.published_on).getTime() / 1000 : null
    }));

    let mainThread = processedItems.find(t => t.code === postCodeFromUrl);
    if (!mainThread) {
      console.log("[警告] DOM 中找不到主貼文,嘗試抓取第一個項目作為主文");
      // Actually apply the fallback announced by the log line above.
      mainThread = processedItems[0];
    }

    // Everything except the main post (and any other item sharing the URL's code).
    const replies = processedItems.filter(t => t !== mainThread && t.code !== postCodeFromUrl);

    console.log(`[成功] 作者: ${mainThread?.username || '未知'},回覆數: ${replies.length}`);

    return { thread: mainThread, replies };
  } finally {
    // Always release the browser, including on navigation or scraping errors.
    await browser.close();
  }
}