update scrapper

main
reng 4 weeks ago
parent c11f9b05cb
commit a3a24c67fc
  1. 2
      Assets/Qdrant/storage/collections/data-v1/0/newest_clocks.json
  2. 2
      Assets/Qdrant/storage/collections/data-v2/0/newest_clocks.json
  3. 2
      Assets/Qdrant/storage/collections/data-v3/0/newest_clocks.json
  4. 2
      Assets/Qdrant/storage/collections/theme-v1/0/newest_clocks.json
  5. 33
      v2/scrapper/main.js
  6. 232
      v2/scrapper/scrapper.js
  7. 13
      v2/scrapper/search.js

@ -1 +1 @@
{"clocks":[{"peer_id":2200039024721645,"clock_id":6,"current_tick":1,"token":9225203475475803001},{"peer_id":2200039024721645,"clock_id":2,"current_tick":13,"token":18171945371147883509},{"peer_id":2200039024721645,"clock_id":0,"current_tick":31,"token":14274923382223712766},{"peer_id":2200039024721645,"clock_id":1,"current_tick":19,"token":13530503213804333634},{"peer_id":2200039024721645,"clock_id":4,"current_tick":3,"token":3225682797052271524},{"peer_id":2200039024721645,"clock_id":5,"current_tick":3,"token":16737324663161229622},{"peer_id":2200039024721645,"clock_id":3,"current_tick":5,"token":12159487072447257415}]} {"clocks":[{"peer_id":2200039024721645,"clock_id":2,"current_tick":13,"token":18171945371147883509},{"peer_id":2200039024721645,"clock_id":0,"current_tick":31,"token":14274923382223712766},{"peer_id":2200039024721645,"clock_id":1,"current_tick":19,"token":13530503213804333634},{"peer_id":2200039024721645,"clock_id":6,"current_tick":1,"token":9225203475475803001},{"peer_id":2200039024721645,"clock_id":3,"current_tick":5,"token":12159487072447257415},{"peer_id":2200039024721645,"clock_id":4,"current_tick":3,"token":3225682797052271524},{"peer_id":2200039024721645,"clock_id":5,"current_tick":3,"token":16737324663161229622}]}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -1 +1 @@
{"clocks":[{"peer_id":2200039024721645,"clock_id":0,"current_tick":9,"token":9315707229535449938},{"peer_id":2200039024721645,"clock_id":1,"current_tick":1,"token":18111163424196140885}]} {"clocks":[{"peer_id":2200039024721645,"clock_id":1,"current_tick":1,"token":18111163424196140885},{"peer_id":2200039024721645,"clock_id":0,"current_tick":9,"token":9315707229535449938}]}

@ -11,17 +11,29 @@ const cookies = [
{ {
name: 'sessionid', name: 'sessionid',
value: '64605724719%3ALlZCmwghVyOAck%3A23%3AAYhptDoKttkGRWkpa5583neohBfLXlGfOlwPPmdP1w', value: '64605724719%3ALlZCmwghVyOAck%3A23%3AAYhptDoKttkGRWkpa5583neohBfLXlGfOlwPPmdP1w',
domain: '.threads.com' domain: '.threads.com',
path:'/',
httpOnly:true,
secure:true,
expires: Math.floor(Date.now()/1000)+60*60*24*365,
}, },
{ {
name: 'ds_user_id', name: 'ds_user_id',
value: '64605724719', value: '64605724719',
domain: '.threads.com' domain: '.threads.com',
path:'/',
httpOnly:true,
secure:true,
expires: Math.floor(Date.now()/1000)+60*60*24*365,
}, },
{ {
name:'csrftoken', name:'csrftoken',
value:'SI5YedKIeuSAgAbdtfynUwzrmSAGquxH', value:'SI5YedKIeuSAgAbdtfynUwzrmSAGquxH',
domain:'.threads.com' domain: '.threads.com',
path:'/',
httpOnly:true,
secure:true,
expires: Math.floor(Date.now()/1000)+60*60*24*365,
} }
]; ];
@ -41,23 +53,28 @@ const Keywords=[
const Version="v2-1"; const Version="v2-1";
const DEBUG_MODE=true; const DEBUG_MODE=true;
const SCRAP_TYPE='TAG'; // 'KEYWORD' or 'TAG'
async function step1(){ async function step1(){
// const chooseKeywords=['人生 妥協','邊界 被侵犯','卡住 職涯','尷尬 年齡','厭世 創作']; // const chooseKeywords=['人生 妥協','邊界 被侵犯','卡住 職涯','尷尬 年齡','厭世 創作'];
const chooseThemse=['職涯','00後整頓職場','生活方式','長大後才懂的事','privilege']; const chooseThemse=['職涯','00後整頓職場','生活方式','長大後才懂的事','privilege'];
const chooseKeywords=['成年 限制 學貸 獨立','學貸 房租 焦慮','職場 權利 privilege','職涯 嚮往','門禁 自由 限制','人生 妥協 期待','人生 自主']; const chooseKeywords=['成年 限制 學貸 獨立','學貸 房租 焦慮','職場 權利 privilege','職涯 嚮往','門禁 自由 限制','人生 妥協 期待','人生 自主'];
let select=SCRAP_TYPE==='KEYWORD'? chooseKeywords : chooseThemse;
// const keyword="工作 生活 平衡"; // const keyword="工作 生活 平衡";
for(const keyword of chooseKeywords){ for(const keyword of select){
if(DEBUG_MODE){
if(keyword!==chooseKeywords[0] && SCRAP_TYPE==='KEYWORD') continue;
if(keyword!==chooseThemse[0] && SCRAP_TYPE==='TAG') continue;
}
// const searchResults = await searchThreads(keyword, 50, cookies, 'TAG'); // const searchResults = await searchThreads(keyword, 50, cookies, 'TAG');
const searchResults = await searchThreads(keyword, 50, cookies, 'KEYWORD'); const searchResults = await searchThreads(keyword, 50, cookies, SCRAP_TYPE);
console.log(JSON.stringify(searchResults)); console.log(JSON.stringify(searchResults));
for(const url of searchResults.urls){ for(const url of searchResults.urls){
try{ try{
const threadItems=await getThread(url); const threadItems=await getThread(url, cookies);
// console.log(JSON.stringify(threadItems)); // console.log(JSON.stringify(threadItems));
// save to filesystem // save to filesystem
@ -165,6 +182,8 @@ async function step3(){
async function main(){ async function main(){
await step1(); await step1();
// await getThread('https://www.threads.com/@pytteliten_/post/DJpeh8BoO3a', cookies);
// await step2(); // await step2();
// await step3(); // await step3();

@ -105,83 +105,209 @@ function parseThread(data) {
return result.id ? result : null; return result.id ? result : null;
} }
/**
 * Collect every post/reply currently rendered in the Threads page DOM.
 *
 * This function is passed to `page.evaluate(...)`, so it must stay
 * self-contained: it may only rely on browser globals (`document`) and on
 * helpers declared inside its own body.
 *
 * @returns {Promise<Array<{
 *   username: string,
 *   text: string,
 *   published_on: (string|null),
 *   like_count_str: string,
 *   reply_count_str: string,
 *   code: (string|null),
 *   url: (string|null)
 * }>>} One entry per container that has a username and either text or a post code.
 */
async function scrapeThreadDataFromDOM() {
  const items = [];

  // Each post/reply is expected to be wrapped in a div with
  // data-pressable-container="true".
  const containers = document.querySelectorAll('div[data-pressable-container="true"]');

  // Reads the counter shown next to the action icon with the given aria-label.
  // Always returns a string ("0" when the icon or its counter is missing), so
  // the *_count_str fields have one consistent type and a missing wrapper
  // node no longer throws and discards the whole post.
  const getCountByLabel = (container, label) => {
    const svg = container.querySelector(`svg[aria-label="${label}"]`);
    // The counter is looked up in a <span> under the parent of the icon's closest <div>.
    const countEl = svg?.closest('div')?.parentElement?.querySelector('span');
    return countEl ? countEl.innerText : '0';
  };

  for (const container of containers) {
    try {
      // 1. Username
      const userEl = container.querySelector('a[href*="/@"] span[translate="no"]');
      const username = userEl ? userEl.innerText.trim() : null;

      // 2. Post text (looked up under the generated class x1a6qonq —
      //    NOTE(review): such class names may change between site builds).
      const textEls = container.querySelectorAll('.x1a6qonq span[dir="auto"]');
      const text = Array.from(textEls, (el) => el.innerText).join('\n').trim();

      // 3. Publication time, taken verbatim from the <time datetime="..."> attribute.
      const timeEl = container.querySelector('time');
      const published_on = timeEl ? timeEl.getAttribute('datetime') : null;

      // 4. Interaction counters (likes, replies).
      //    NOTE(review): these aria-labels are the Chinese UI strings; presumably
      //    the page must be rendered in that locale for them to match — confirm.
      const like_count_str = getCountByLabel(container, '讚');
      const reply_count_str = getCountByLabel(container, '回覆');

      // 5. Post code, extracted from the first link that points at /post/<code>.
      const linkEl = container.querySelector('a[href*="/post/"]');
      const postUrl = linkEl ? linkEl.href : null;
      const code = postUrl ? postUrl.split('/post/')[1]?.split('/')[0] : null;

      // Keep only entries that identify an author and carry some content or a code.
      if (username && (text || code)) {
        items.push({
          username,
          text,
          published_on,
          like_count_str,
          reply_count_str,
          code,
          url: postUrl,
        });
      }
    } catch (err) {
      // Best effort: one malformed container must not abort the whole scrape.
      console.error("DOM Parsing error item:", err);
    }
  }

  return items;
}
export async function getThread(postUrl, cookies) {
if (!postUrl?.includes("threads.")) throw new Error("無效的 Threads 網址"); if (!postUrl?.includes("threads.")) throw new Error("無效的 Threads 網址");
const postCodeFromUrl = postUrl.split("/post/")[1]?.split("/")[0]; const postCodeFromUrl = postUrl.split("/post/")[1]?.split("/")[0];
console.log(`[目標鎖定] 貼文代碼: ${postCodeFromUrl}`); console.log(`[目標鎖定] 貼文代碼: ${postCodeFromUrl}`);
const browser = await puppeteer.launch({ const browser = await puppeteer.launch({
headless: "new", headless: true,
args: ['--disable-blink-features=AutomationControlled', '--no-sandbox', '--disable-setuid-sandbox'] args: [
'--disable-blink-features=AutomationControlled',
'--no-sandbox',
'--disable-setuid-sandbox',
'--window-size=1920,1080'
],
slowMo: 50
}); });
try { try {
const page = await browser.newPage(); const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
// 增加等待時間確保網路請求啟動 if (cookies && cookies.length > 0) {
await page.goto(postUrl, { waitUntil: 'networkidle2', timeout: 30000 }); console.log(`[Auth] 正在注入 ${cookies.length} 個 Session Cookies...`);
try {
// 滾動是必須的,因為 Relay 串流需要滾動來觸發 JS 解析 // 確保每個 cookie 對象至少包含 name, value 以及 domain 或 url
await page.evaluate(async () => { await browser.setCookie(...cookies);
await new Promise((resolve) => { console.log("[Auth] Cookies 注入成功");
let totalHeight = 0; // console.log(await browser.cookies());
let distance = 500;
let timer = setInterval(() => {
let scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
if (totalHeight >= scrollHeight || totalHeight > 5000) {
clearInterval(timer);
resolve();
}
}, 100);
});
});
// 增加等待時間,讓 Streaming 區塊載入完畢 } catch (cookieError) {
await new Promise(r => setTimeout(r, 5000)); console.error("[Auth] Cookies 格式錯誤或注入失敗:", cookieError.message);
}
}
await page.setViewport({ width: 1280, height: 900 });
const allScripts = await page.$$eval( console.log("[2/5] 頁面跳轉中...");
'script[type="application/json"]', await page.goto(postUrl, {
(scripts) => scripts.map((s) => s.textContent) waitUntil: 'domcontentloaded',
); timeout: 10000
});
let allParsedItems = new Map(); try {
await page.waitForSelector('main', { timeout: 15000 });
console.log("[3/5] 主容器已載入");
} catch (e) {
console.log("[警告] 等待 main 超時,繼續執行...");
}
allScripts.forEach((content) => { console.log("[4/5] 開始執行滾動腳本...");
if (!content || !content.includes("ScheduledServerJS")) return;
try {
const data = JSON.parse(content);
const rawItems = nestedLookup(data).flat();
rawItems.forEach(item => {
if (!item) return;
const parsed = parseThread(item);
if (parsed && parsed.id) {
allParsedItems.set(parsed.id, parsed);
}
});
} catch (e) {}
});
const itemsArray = Array.from(allParsedItems.values()); console.log("[4/5] 執行模擬滑鼠滾動以加載大量留言...");
const mainThread = itemsArray.find(t => t.code === postCodeFromUrl); // 將滑鼠移動到頁面中心以確保滾動事件被正確接收
await page.mouse.move(960, 540);
// 執行多次模擬滑鼠滾輪滾動
const totalScrolls = 25;
const scrollDistance = 1000;
// 獲取初始高度
let lastHeight = await page.evaluate(() => document.body.scrollHeight);
console.log(` - 初始頁面高度: ${lastHeight}px`);
for (let i = 0; i < totalScrolls; i++) {
await page.mouse.wheel({ deltaY: scrollDistance });
await new Promise(r => setTimeout(r, 1500)); // 稍微增加等待時間讓 GraphQL 有反應
// 每 5 次滾動做一次大幅度動作
if (i % 5 === 0 && i !== 0) {
await page.mouse.wheel({ deltaY: -600 });
await new Promise(r => setTimeout(r, 500));
await page.mouse.wheel({ deltaY: 800 });
}
// 檢查高度變化
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
if (currentHeight > lastHeight) {
console.log(` - 第 ${i + 1} 次滾動: 偵測到內容加載!高度增加 ${currentHeight - lastHeight}px (目前: ${currentHeight}px)`);
lastHeight = currentHeight;
} else {
console.log(` - 第 ${i + 1} 次滾動: 高度未變化 (${currentHeight}px)`);
}
}
console.log("[5/5] 等待數據流穩定 (10s)...");
await new Promise(r => setTimeout(r, 10000));
const rawItems = await page.evaluate(scrapeThreadDataFromDOM);
// 後處理數據:轉換數字與區分主文/回覆
const processedItems = rawItems.map(item => ({
...item,
like_count: (item.like_count_str),
reply_count: (item.reply_count_str),
published_at: item.published_on ? new Date(item.published_on).getTime() / 1000 : null
}));
const mainThread = processedItems.find(t => t.code === postCodeFromUrl);
const replies = processedItems.filter(t => t.code !== postCodeFromUrl);
if (!mainThread) { if (!mainThread) {
throw new Error("無法定位主貼文。這通常是因為 Auth 阻擋或頁面未完全渲染。"); console.log("[警告] DOM 中找不到主貼文,嘗試抓取第一個項目作為主文");
} }
console.log(`[成功] 作者: ${mainThread?.username || '未知'},回覆數: ${replies.length}`);
// const allScripts = await page.$$eval(
// 'script[type="application/json"]',
// (scripts) => scripts.map((s) => s.textContent)
// );
// console.log(`[解析] 成功掃描到 ${allScripts.length} 個 JSON 區塊`);
// let allParsedItems = new Map();
// allScripts.forEach((content) => {
// if (!content || !content.includes("ScheduledServerJS")) return;
// try {
// const data = JSON.parse(content);
// const rawItems = nestedLookup(data).flat();
// rawItems.forEach(item => {
// if (!item) return;
// const parsed = parseThread(item);
// if (parsed && parsed.id) {
// allParsedItems.set(parsed.id, parsed);
// }
// });
// } catch (e) {}
// });
// const itemsArray = Array.from(allParsedItems.values());
// const mainThread = itemsArray.find(t => t.code === postCodeFromUrl);
// if (!mainThread) {
// throw new Error("無法定位主貼文。這通常是因為 Auth 阻擋或頁面未完全渲染。");
// }
const authorName = mainThread.username; // const authorName = mainThread.username;
const replies = itemsArray.filter(t => // const replies = itemsArray.filter(t =>
t.code !== postCodeFromUrl // t.code !== postCodeFromUrl
&& mainThread.id.includes(t.parent_post_id) // && mainThread.id.includes(t.parent_post_id)
).sort((a, b) => a.published_on - b.published_on); // ).sort((a, b) => a.published_on - b.published_on);
console.log(`[解析成功] 找到主貼文,作者: ${authorName},回覆數量: ${replies.length}`); // console.log(`[解析成功] 找到主貼文,作者: ${authorName},回覆數量: ${replies.length}`);
return { thread: mainThread, replies }; return { thread: mainThread, replies };

@ -38,10 +38,17 @@ export async function searchThreads(keyword, limit = 20, cookies = [], searchMod
'Accept-Language': 'en-US,en;q=0.9', 'Accept-Language': 'en-US,en;q=0.9',
}); });
// Inject cookies if provided to bypass login wall
if (cookies && cookies.length > 0) { if (cookies && cookies.length > 0) {
console.log("[Auth] Injecting session cookies..."); console.log(`[Auth] 正在注入 ${cookies.length} 個 Session Cookies...`);
await page.setCookie(...cookies); try {
// 確保每個 cookie 對象至少包含 name, value 以及 domain 或 url
await browser.setCookie(...cookies);
console.log("[Auth] Cookies 注入成功");
// console.log(await browser.cookies());
} catch (cookieError) {
console.error("[Auth] Cookies 格式錯誤或注入失敗:", cookieError.message);
}
} }
await page.setViewport({ width: 1280, height: 900 }); await page.setViewport({ width: 1280, height: 900 });

Loading…
Cancel
Save