|
|
|
@ -105,83 +105,209 @@ function parseThread(data) { |
|
|
|
|
|
|
|
|
|
|
|
return result.id ? result : null; |
|
|
|
return result.id ? result : null; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/**
 * Collects every post/reply currently rendered in the page's DOM.
 *
 * This function is passed directly to `page.evaluate(...)`, so it relies only
 * on browser globals (`document`, `console`). Keep every helper nested inside
 * the function body; referencing module-level names here would not work once
 * the function runs in the page context.
 *
 * Containers without a username, or with neither text nor a post code, are
 * skipped. A failure while parsing one container is logged and does not abort
 * the scan of the remaining containers.
 *
 * @returns {Promise<Array<{
 *   username: string,
 *   text: string,
 *   published_on: (string|null),
 *   like_count_str: string,
 *   reply_count_str: string,
 *   code: (string|null|undefined),
 *   url: (string|null)
 * }>>} One entry per parsed post/reply, in DOM order.
 */
async function scrapeThreadDataFromDOM() {
    const items = [];

    // Each Threads post/reply is usually wrapped in a div carrying
    // data-pressable-container="true".
    const containers = document.querySelectorAll('div[data-pressable-container="true"]');

    /**
     * Reads the counter text shown next to the action icon whose aria-label
     * equals `label`. Always returns a string: "0" when the icon, its wrapping
     * div, that div's parent, or the counter span cannot be found.
     */
    const getCountByLabel = (container, label) => {
        const svg = container.querySelector(`svg[aria-label="${label}"]`);
        // The number usually sits in a span next to the SVG. Walk the chain
        // null-safely so that a missing ancestor yields "0" instead of throwing
        // and causing the whole post to be dropped by the caller's catch.
        const countContainer = svg?.closest('div')?.parentElement?.querySelector('span');
        return countContainer ? countContainer.innerText : "0";
    };

    for (const container of containers) {
        try {
            // 1. Username
            const userEl = container.querySelector('a[href*="/@"] span[translate="no"]');
            const username = userEl ? userEl.innerText.trim() : null;

            // 2. Post text (usually found under the x1a6qonq class)
            const textContentEls = container.querySelectorAll('.x1a6qonq span[dir="auto"]');
            const text = Array.from(textContentEls, (el) => el.innerText).join('\n').trim();

            // 3. Publish time, taken verbatim from the <time datetime="..."> attribute
            const timeEl = container.querySelector('time');
            const published_on = timeEl ? timeEl.getAttribute('datetime') : null;

            // 4. Engagement counters, located through the aria-label of their SVG icon
            const like_count_str = getCountByLabel(container, "讚");
            const reply_count_str = getCountByLabel(container, "回覆");

            // 5. Post URL and code, taken from the link that contains "/post/"
            const linkEl = container.querySelector('a[href*="/post/"]');
            const postUrl = linkEl ? linkEl.href : null;
            const code = postUrl ? postUrl.split('/post/')[1]?.split('/')[0] : null;

            // Keep only containers that look like a real post.
            if (username && (text || code)) {
                items.push({
                    username,
                    text,
                    published_on,
                    like_count_str,
                    reply_count_str,
                    code,
                    url: postUrl
                });
            }
        } catch (err) {
            console.error("DOM Parsing error item:", err);
        }
    }

    return items;
}
|
|
|
|
|
|
|
export async function getThread(postUrl, cookies) { |
|
|
|
if (!postUrl?.includes("threads.")) throw new Error("無效的 Threads 網址"); |
|
|
|
if (!postUrl?.includes("threads.")) throw new Error("無效的 Threads 網址"); |
|
|
|
|
|
|
|
|
|
|
|
const postCodeFromUrl = postUrl.split("/post/")[1]?.split("/")[0]; |
|
|
|
const postCodeFromUrl = postUrl.split("/post/")[1]?.split("/")[0]; |
|
|
|
console.log(`[目標鎖定] 貼文代碼: ${postCodeFromUrl}`); |
|
|
|
console.log(`[目標鎖定] 貼文代碼: ${postCodeFromUrl}`); |
|
|
|
|
|
|
|
|
|
|
|
const browser = await puppeteer.launch({ |
|
|
|
const browser = await puppeteer.launch({ |
|
|
|
headless: "new", |
|
|
|
headless: true,
|
|
|
|
args: ['--disable-blink-features=AutomationControlled', '--no-sandbox', '--disable-setuid-sandbox'] |
|
|
|
args: [ |
|
|
|
|
|
|
|
'--disable-blink-features=AutomationControlled',
|
|
|
|
|
|
|
|
'--no-sandbox',
|
|
|
|
|
|
|
|
'--disable-setuid-sandbox', |
|
|
|
|
|
|
|
'--window-size=1920,1080' |
|
|
|
|
|
|
|
], |
|
|
|
|
|
|
|
slowMo: 50
|
|
|
|
}); |
|
|
|
}); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try { |
|
|
|
try { |
|
|
|
const page = await browser.newPage(); |
|
|
|
const page = await browser.newPage(); |
|
|
|
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); |
|
|
|
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); |
|
|
|
|
|
|
|
|
|
|
|
// 增加等待時間確保網路請求啟動
|
|
|
|
if (cookies && cookies.length > 0) { |
|
|
|
await page.goto(postUrl, { waitUntil: 'networkidle2', timeout: 30000 }); |
|
|
|
console.log(`[Auth] 正在注入 ${cookies.length} 個 Session Cookies...`); |
|
|
|
|
|
|
|
try { |
|
|
|
// 滾動是必須的,因為 Relay 串流需要滾動來觸發 JS 解析
|
|
|
|
// 確保每個 cookie 對象至少包含 name, value 以及 domain 或 url
|
|
|
|
await page.evaluate(async () => { |
|
|
|
await browser.setCookie(...cookies); |
|
|
|
await new Promise((resolve) => { |
|
|
|
console.log("[Auth] Cookies 注入成功"); |
|
|
|
let totalHeight = 0; |
|
|
|
// console.log(await browser.cookies());
|
|
|
|
let distance = 500; |
|
|
|
|
|
|
|
let timer = setInterval(() => { |
|
|
|
|
|
|
|
let scrollHeight = document.body.scrollHeight; |
|
|
|
|
|
|
|
window.scrollBy(0, distance); |
|
|
|
|
|
|
|
totalHeight += distance; |
|
|
|
|
|
|
|
if (totalHeight >= scrollHeight || totalHeight > 5000) { |
|
|
|
|
|
|
|
clearInterval(timer); |
|
|
|
|
|
|
|
resolve(); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
}, 100); |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 增加等待時間,讓 Streaming 區塊載入完畢
|
|
|
|
} catch (cookieError) { |
|
|
|
await new Promise(r => setTimeout(r, 5000)); |
|
|
|
console.error("[Auth] Cookies 格式錯誤或注入失敗:", cookieError.message); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
await page.setViewport({ width: 1280, height: 900 }); |
|
|
|
|
|
|
|
|
|
|
|
const allScripts = await page.$$eval( |
|
|
|
console.log("[2/5] 頁面跳轉中..."); |
|
|
|
'script[type="application/json"]', |
|
|
|
await page.goto(postUrl, {
|
|
|
|
(scripts) => scripts.map((s) => s.textContent) |
|
|
|
waitUntil: 'domcontentloaded',
|
|
|
|
); |
|
|
|
timeout: 10000
|
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
|
|
|
|
let allParsedItems = new Map(); |
|
|
|
try { |
|
|
|
|
|
|
|
await page.waitForSelector('main', { timeout: 15000 }); |
|
|
|
|
|
|
|
console.log("[3/5] 主容器已載入"); |
|
|
|
|
|
|
|
} catch (e) { |
|
|
|
|
|
|
|
console.log("[警告] 等待 main 超時,繼續執行..."); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
allScripts.forEach((content) => { |
|
|
|
console.log("[4/5] 開始執行滾動腳本..."); |
|
|
|
if (!content || !content.includes("ScheduledServerJS")) return; |
|
|
|
|
|
|
|
try { |
|
|
|
|
|
|
|
const data = JSON.parse(content); |
|
|
|
|
|
|
|
const rawItems = nestedLookup(data).flat(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
rawItems.forEach(item => { |
|
|
|
|
|
|
|
if (!item) return; |
|
|
|
|
|
|
|
const parsed = parseThread(item); |
|
|
|
|
|
|
|
if (parsed && parsed.id) { |
|
|
|
|
|
|
|
allParsedItems.set(parsed.id, parsed); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
} catch (e) {} |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const itemsArray = Array.from(allParsedItems.values()); |
|
|
|
console.log("[4/5] 執行模擬滑鼠滾動以加載大量留言..."); |
|
|
|
|
|
|
|
|
|
|
|
const mainThread = itemsArray.find(t => t.code === postCodeFromUrl); |
|
|
|
// 將滑鼠移動到頁面中心以確保滾動事件被正確接收
|
|
|
|
|
|
|
|
await page.mouse.move(960, 540); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 執行多次模擬滑鼠滾輪滾動
|
|
|
|
|
|
|
|
const totalScrolls = 25; |
|
|
|
|
|
|
|
const scrollDistance = 1000; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 獲取初始高度
|
|
|
|
|
|
|
|
let lastHeight = await page.evaluate(() => document.body.scrollHeight); |
|
|
|
|
|
|
|
console.log(` - 初始頁面高度: ${lastHeight}px`); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (let i = 0; i < totalScrolls; i++) { |
|
|
|
|
|
|
|
await page.mouse.wheel({ deltaY: scrollDistance }); |
|
|
|
|
|
|
|
await new Promise(r => setTimeout(r, 1500)); // 稍微增加等待時間讓 GraphQL 有反應
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 每 5 次滾動做一次大幅度動作
|
|
|
|
|
|
|
|
if (i % 5 === 0 && i !== 0) { |
|
|
|
|
|
|
|
await page.mouse.wheel({ deltaY: -600 }); |
|
|
|
|
|
|
|
await new Promise(r => setTimeout(r, 500)); |
|
|
|
|
|
|
|
await page.mouse.wheel({ deltaY: 800 }); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 檢查高度變化
|
|
|
|
|
|
|
|
const currentHeight = await page.evaluate(() => document.body.scrollHeight); |
|
|
|
|
|
|
|
if (currentHeight > lastHeight) { |
|
|
|
|
|
|
|
console.log(` - 第 ${i + 1} 次滾動: 偵測到內容加載!高度增加 ${currentHeight - lastHeight}px (目前: ${currentHeight}px)`); |
|
|
|
|
|
|
|
lastHeight = currentHeight; |
|
|
|
|
|
|
|
} else { |
|
|
|
|
|
|
|
console.log(` - 第 ${i + 1} 次滾動: 高度未變化 (${currentHeight}px)`); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
console.log("[5/5] 等待數據流穩定 (10s)..."); |
|
|
|
|
|
|
|
await new Promise(r => setTimeout(r, 10000)); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const rawItems = await page.evaluate(scrapeThreadDataFromDOM); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 後處理數據:轉換數字與區分主文/回覆
|
|
|
|
|
|
|
|
const processedItems = rawItems.map(item => ({ |
|
|
|
|
|
|
|
...item, |
|
|
|
|
|
|
|
like_count: (item.like_count_str), |
|
|
|
|
|
|
|
reply_count: (item.reply_count_str), |
|
|
|
|
|
|
|
published_at: item.published_on ? new Date(item.published_on).getTime() / 1000 : null |
|
|
|
|
|
|
|
})); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const mainThread = processedItems.find(t => t.code === postCodeFromUrl); |
|
|
|
|
|
|
|
const replies = processedItems.filter(t => t.code !== postCodeFromUrl); |
|
|
|
|
|
|
|
|
|
|
|
if (!mainThread) { |
|
|
|
if (!mainThread) { |
|
|
|
throw new Error("無法定位主貼文。這通常是因為 Auth 阻擋或頁面未完全渲染。"); |
|
|
|
console.log("[警告] DOM 中找不到主貼文,嘗試抓取第一個項目作為主文"); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
console.log(`[成功] 作者: ${mainThread?.username || '未知'},回覆數: ${replies.length}`); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// const allScripts = await page.$$eval(
|
|
|
|
|
|
|
|
// 'script[type="application/json"]',
|
|
|
|
|
|
|
|
// (scripts) => scripts.map((s) => s.textContent)
|
|
|
|
|
|
|
|
// );
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// console.log(`[解析] 成功掃描到 ${allScripts.length} 個 JSON 區塊`);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// let allParsedItems = new Map();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// allScripts.forEach((content) => {
|
|
|
|
|
|
|
|
// if (!content || !content.includes("ScheduledServerJS")) return;
|
|
|
|
|
|
|
|
// try {
|
|
|
|
|
|
|
|
// const data = JSON.parse(content);
|
|
|
|
|
|
|
|
// const rawItems = nestedLookup(data).flat();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// rawItems.forEach(item => {
|
|
|
|
|
|
|
|
// if (!item) return;
|
|
|
|
|
|
|
|
// const parsed = parseThread(item);
|
|
|
|
|
|
|
|
// if (parsed && parsed.id) {
|
|
|
|
|
|
|
|
// allParsedItems.set(parsed.id, parsed);
|
|
|
|
|
|
|
|
// }
|
|
|
|
|
|
|
|
// });
|
|
|
|
|
|
|
|
// } catch (e) {}
|
|
|
|
|
|
|
|
// });
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// const itemsArray = Array.from(allParsedItems.values());
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// const mainThread = itemsArray.find(t => t.code === postCodeFromUrl);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// if (!mainThread) {
|
|
|
|
|
|
|
|
// throw new Error("無法定位主貼文。這通常是因為 Auth 阻擋或頁面未完全渲染。");
|
|
|
|
|
|
|
|
// }
|
|
|
|
|
|
|
|
|
|
|
|
const authorName = mainThread.username; |
|
|
|
// const authorName = mainThread.username;
|
|
|
|
const replies = itemsArray.filter(t =>
|
|
|
|
// const replies = itemsArray.filter(t =>
|
|
|
|
t.code !== postCodeFromUrl |
|
|
|
// t.code !== postCodeFromUrl
|
|
|
|
&& mainThread.id.includes(t.parent_post_id) |
|
|
|
// && mainThread.id.includes(t.parent_post_id)
|
|
|
|
).sort((a, b) => a.published_on - b.published_on); |
|
|
|
// ).sort((a, b) => a.published_on - b.published_on);
|
|
|
|
|
|
|
|
|
|
|
|
console.log(`[解析成功] 找到主貼文,作者: ${authorName},回覆數量: ${replies.length}`); |
|
|
|
// console.log(`[解析成功] 找到主貼文,作者: ${authorName},回覆數量: ${replies.length}`);
|
|
|
|
|
|
|
|
|
|
|
|
return { thread: mainThread, replies }; |
|
|
|
return { thread: mainThread, replies }; |
|
|
|
|
|
|
|
|
|
|
|
|