parent
503ed29ffd
commit
d8bb02058d
9 changed files with 1491 additions and 5 deletions
@ -1,2 +1,4 @@ |
|||||||
/v1/node/node_modules |
/v1/node/node_modules |
||||||
/v1/python/cache |
/v1/python/cache |
||||||
|
/v2/scrapper/node_modules |
||||||
|
/v2/scrapper/scrapped |
||||||
|
|||||||
@ -0,0 +1,48 @@ |
|||||||
|
import { getThread } from "./scrapper.js";
import { searchThreads } from "./search.js";
import { mkdirSync, writeFileSync } from "fs";
||||||
|
|
||||||
|
/**
 * Builds the Puppeteer cookie list used to authenticate against Threads.
 *
 * Credentials are read from the environment instead of being committed to the
 * repository:
 *   - THREADS_SESSIONID  -> `sessionid`
 *   - THREADS_DS_USER_ID -> `ds_user_id`
 *   - THREADS_CSRFTOKEN  -> `csrftoken`
 *
 * Cookies whose variable is unset or empty are omitted, so an unconfigured
 * environment yields an empty list (anonymous session).
 *
 * NOTE(review): the session values that used to be hard-coded here were
 * published in version control and should be revoked.
 *
 * @param {Record<string, string | undefined>} [env=process.env] - source of the values.
 * @returns {Array<{name: string, value: string, domain: string}>} cookies for `page.setCookie`.
 */
function loadCookiesFromEnv(env = process.env) {
  const cookieSources = [
    ["sessionid", env.THREADS_SESSIONID],
    ["ds_user_id", env.THREADS_DS_USER_ID],
    ["csrftoken", env.THREADS_CSRFTOKEN],
  ];

  return cookieSources
    .filter(([, value]) => typeof value === "string" && value.length > 0)
    .map(([name, value]) => ({ name, value, domain: ".threads.com" }));
}

const cookies = loadCookiesFromEnv();
||||||
|
|
||||||
|
/**
 * Script entry point: searches Threads for a fixed keyword, scrapes every
 * returned post, and writes each result to `scrapped/<postCode>.json`
 * (relative to the current working directory).
 *
 * A failure while processing one post is logged and does not stop the
 * remaining posts. A failure of the search itself rejects the returned promise.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const outputDir = "scrapped";

  // The output directory may not exist yet (it is not tracked in the repo);
  // without this every writeFileSync below fails with ENOENT.
  mkdirSync(outputDir, { recursive: true });

  const searchResults = await searchThreads("厭世", 20, cookies);
  console.log(JSON.stringify(searchResults));

  for (const url of searchResults.urls) {
    try {
      const threadItems = await getThread(url);
      console.log(JSON.stringify(threadItems));

      // Use only the URL path so a query string or fragment never ends up in the file name.
      const fileName = new URL(url).pathname.split("/post/")[1].split("/")[0];
      writeFileSync(`${outputDir}/${fileName}.json`, JSON.stringify(threadItems, null, 2));
    } catch (err) {
      console.error("Error processing", url, ":", err);
    }
  }
}
||||||
|
|
||||||
|
// Run the script. Report anything that escapes main() and exit non-zero
// instead of leaving the promise rejection unhandled.
main().catch((err) => {
  console.error("Fatal error:", err);
  process.exitCode = 1;
});
||||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,6 @@ |
|||||||
|
{ |
||||||
|
"type":"module", |
||||||
|
"dependencies": { |
||||||
|
"puppeteer": "^24.34.0" |
||||||
|
} |
||||||
|
} |
||||||
@ -0,0 +1,187 @@ |
|||||||
|
import puppeteer from 'puppeteer'; |
||||||
|
|
||||||
|
/**
 * Deep lookup over a parsed JSON payload that collects every value stored
 * under one of the target keys.
 *
 * Traversal rules:
 *   - Arrays shaped like `["RelayPrefetchedStreamCache", _, _, [key, data]]`
 *     have their `data` element visited first.
 *   - Objects with a `__bbox` wrapper have `__bbox.require` and
 *     `__bbox.define` visited first.
 *   - For a matching key whose value is an array under `edges`,
 *     `thread_items` or `reply_threads`, the array itself is not collected;
 *     only matches found inside it are. Any other non-null value under a
 *     matching key is collected as-is and then searched further.
 *
 * Every object/array is visited at most once, so a value is collected once
 * even though the prioritized branches above overlap with the generic walk.
 *
 * @param {*} obj - parsed JSON value to search; non-objects yield `[]`.
 * @param {string[]} [keys] - property names whose values should be collected.
 * @returns {Array<*>} collected values in first-visit order.
 */
function nestedLookup(obj, keys = ["thread_items", "replies", "child_posts", "threaded_replies", "post", "posts", "edges", "node", "reply_threads"]) {
  // Keys whose array values are containers to descend into rather than results.
  const listKeys = ["edges", "thread_items", "reply_threads"];
  const results = [];
  const visited = new WeakSet();

  const visit = (node) => {
    if (typeof node !== "object" || node === null || visited.has(node)) return;
    visited.add(node);

    if (Array.isArray(node)) {
      // Relay stream cache call: the payload is the second element of the 4th entry.
      if (node[0] === "RelayPrefetchedStreamCache" && Array.isArray(node[3])) {
        visit(node[3][1]);
      }
      for (const item of node) {
        visit(item);
      }
      return;
    }

    // __bbox wrapper: look into its require/define lists before the other properties.
    visit(node.__bbox?.require);
    visit(node.__bbox?.define);

    for (const k in node) {
      const value = node[k];
      if (keys.includes(k) && value !== null) {
        const isListContainer = listKeys.includes(k) && Array.isArray(value);
        if (!isListContainer) {
          results.push(value);
        }
      }
      // Keep descending regardless of whether the key matched.
      visit(value);
    }
  };

  visit(obj);
  return results;
}
||||||
|
|
||||||
|
/**
 * Normalizes one raw candidate object into a flat post summary.
 *
 * The post is looked for, in order, at `data.post`, `data.node.post`,
 * `data.thread_items[0].post`, `data.posts[0]`, `data` itself (when it has a
 * `caption`), and finally inside `data.result.data`
 * (`text_post_app_info`, `post`, or the object itself).
 *
 * Media URLs are intentionally not part of the summary, so no media fields
 * are read here; a malformed media entry therefore cannot make parsing fail.
 *
 * @param {*} data - candidate value, either a post or a wrapper around one.
 * @returns {{text: string, published_on: *, id: *, code: *, username: *,
 *            like_count: number, reply_count: number} | null}
 *          the summary, or `null` when no post with an `id`/`pk` is found.
 */
function parseThread(data) {
  if (!data) return null;

  // Try every known container shape for the post object.
  let post =
    data.post ||
    data.node?.post ||
    data.thread_items?.[0]?.post ||
    (data.posts && data.posts[0]) ||
    (data.caption ? data : null);

  // GraphQL-style `result.data` envelope.
  if (!post && data.result?.data) {
    const d = data.result.data;
    post = d.text_post_app_info || d.post || d;
  }

  // Without an identifier the entry cannot be deduplicated or matched to a URL.
  if (!post || (!post.id && !post.pk)) return null;

  return {
    // Falls back to the quoted post's caption when the post has no text of its own.
    text: post.caption?.text || post.text_post_app_info?.share_info?.quoted_post?.caption?.text || "",
    published_on: post.taken_at,
    id: post.id || post.pk,
    code: post.code,
    username: post.user?.username,
    like_count: post.like_count || 0,
    reply_count: post.direct_reply_count || post.reply_count || 0,
  };
}
||||||
|
|
||||||
|
/**
 * Loads a Threads post page in headless Chrome and extracts the post together
 * with the replies found in the page's embedded JSON.
 *
 * The page's `script[type="application/json"]` blobs that mention
 * `ScheduledServerJS` are parsed, searched with `nestedLookup`, normalized
 * with `parseThread`, and deduplicated by post id. The entry whose `code`
 * equals the code in the URL is the main thread; everything else is returned
 * as replies sorted by `published_on` ascending.
 *
 * @param {string} postUrl - URL containing `threads.` and a `/post/<code>` segment.
 * @param {Array<object>} [cookies=[]] - optional Puppeteer-formatted cookies set before navigation.
 * @returns {Promise<{thread: object, replies: object[]}>}
 * @throws {Error} when the URL is invalid or the main post cannot be located in the page data.
 */
export async function getThread(postUrl, cookies = []) {
  if (typeof postUrl !== "string" || !postUrl.includes("threads.")) {
    throw new Error("無效的 Threads 網址");
  }

  // Stop at the next "/", "?" or "#" so tracking parameters never become part of the code.
  const postCodeFromUrl = postUrl.split("/post/")[1]?.split(/[/?#]/)[0];
  if (!postCodeFromUrl) {
    // Fail before paying for a browser launch: without a code the main post can never be matched.
    throw new Error("無效的 Threads 網址");
  }
  console.log(`[目標鎖定] 貼文代碼: ${postCodeFromUrl}`);

  const browser = await puppeteer.launch({
    headless: "new",
    args: ['--disable-blink-features=AutomationControlled', '--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');

    if (Array.isArray(cookies) && cookies.length > 0) {
      await page.setCookie(...cookies);
    }

    // Wait for the network to settle so the initial requests have been issued.
    await page.goto(postUrl, { waitUntil: 'networkidle2', timeout: 30000 });

    // Scroll down in steps; per the original author's note the Relay stream
    // only gets parsed once the page is scrolled.
    await page.evaluate(async () => {
      await new Promise((resolve) => {
        let totalHeight = 0;
        const distance = 500;
        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;
          // Stop at the bottom of the page or after a fixed safety cap.
          if (totalHeight >= scrollHeight || totalHeight > 5000) {
            clearInterval(timer);
            resolve();
          }
        }, 100);
      });
    });

    // Extra fixed wait so streamed blocks have time to arrive.
    await new Promise((r) => setTimeout(r, 5000));

    const allScripts = await page.$$eval(
      'script[type="application/json"]',
      (scripts) => scripts.map((s) => s.textContent),
    );

    // Keyed by post id so the same post found in several places is kept once.
    const allParsedItems = new Map();

    for (const content of allScripts) {
      if (!content || !content.includes("ScheduledServerJS")) continue;

      let rawItems;
      try {
        rawItems = nestedLookup(JSON.parse(content)).flat();
      } catch (e) {
        // Best effort: skip this blob, but leave a trace instead of hiding the failure.
        console.warn(`[解析警告] 略過無法解析的資料區塊: ${e.message}`);
        continue;
      }

      for (const item of rawItems) {
        if (!item) continue;
        try {
          const parsed = parseThread(item);
          if (parsed && parsed.id) {
            allParsedItems.set(parsed.id, parsed);
          }
        } catch (e) {
          // One malformed entry must not discard the remaining entries of the blob.
          console.warn(`[解析警告] 略過無法解析的貼文項目: ${e.message}`);
        }
      }
    }

    const itemsArray = Array.from(allParsedItems.values());
    const mainThread = itemsArray.find((t) => t.code === postCodeFromUrl);

    if (!mainThread) {
      throw new Error("無法定位主貼文。這通常是因為 Auth 阻擋或頁面未完全渲染。");
    }

    const authorName = mainThread.username;
    const replies = itemsArray
      .filter((t) => t.code !== postCodeFromUrl)
      .sort((a, b) => a.published_on - b.published_on);

    console.log(`[解析成功] 找到主貼文,作者: ${authorName},回覆數量: ${replies.length}`);

    return { thread: mainThread, replies };
  } finally {
    await browser.close();
  }
}
||||||
@ -0,0 +1,114 @@ |
|||||||
|
import puppeteer from 'puppeteer'; |
||||||
|
|
||||||
|
/**
 * Reduces a post-related link to the URL of the post itself: origin plus the
 * path up to and including `/post/<code>`. Query string, fragment and any
 * trailing path segments are dropped.
 *
 * @param {string} href - absolute URL taken from an anchor element.
 * @returns {string|null} canonical post URL, or `null` if `href` is not a post URL.
 */
function canonicalPostUrl(href) {
  let parsed;
  try {
    parsed = new URL(href);
  } catch {
    return null;
  }
  const match = parsed.pathname.match(/^.*?\/post\/[^/]+/);
  return match ? `${parsed.origin}${match[0]}` : null;
}

/**
 * Threads Search Scraper
 *
 * Opens the Threads search page for a keyword in headless Chrome, scrolls to
 * load results, and returns the unique post URLs found on the page.
 *
 * @param {string} keyword - The search term.
 * @param {number} [limit=20] - Maximum number of URLs to return.
 * @param {Array} [cookies=[]] - Optional Puppeteer-formatted cookies used to get past the login wall.
 * @returns {Promise<{keyword: string, count: number, urls: string[]}>}
 * @throws {Error} when no keyword is given, when Threads redirects to the
 *         login page, or when navigation fails.
 */
export async function searchThreads(keyword, limit = 20, cookies = []) {
  if (!keyword) {
    throw new Error("Please provide a keyword for the search.");
  }

  const searchUrl = `https://www.threads.net/search?q=${encodeURIComponent(keyword)}&serp_type=default`;
  console.log(`[Search Start] Keyword: "${keyword}" | URL: ${searchUrl}`);

  const browser = await puppeteer.launch({
    headless: "new",
    args: [
      '--disable-blink-features=AutomationControlled',
      '--no-sandbox',
      '--disable-setuid-sandbox',
    ],
  });

  try {
    const page = await browser.newPage();

    // Anti-detection headers
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36');
    await page.setExtraHTTPHeaders({
      'Accept-Language': 'en-US,en;q=0.9',
    });

    // Inject cookies if provided to bypass login wall
    if (cookies && cookies.length > 0) {
      console.log("[Auth] Injecting session cookies...");
      await page.setCookie(...cookies);
    }

    await page.setViewport({ width: 1280, height: 900 });

    // Navigate to search results
    await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 60000 });

    // Check if we were redirected to the login page
    const currentUrl = page.url();
    if (currentUrl.includes('/login')) {
      console.error("[Auth Error] Threads redirected to login. You must provide valid session cookies.");
      throw new Error("Authentication required: Threads search is restricted to logged-in users.");
    }

    // Wait for the first post link; a timeout here is tolerated and simply
    // leads to an empty result below.
    try {
      await page.waitForSelector('a[href*="/post/"]', { timeout: 20000 });
    } catch (e) {
      console.warn("[Warning] Results did not load. This might be a login wall or no results found.");
    }

    // Scroll to load more results
    await page.evaluate(async (maxItems) => {
      await new Promise((resolve) => {
        let totalHeight = 0;
        const distance = 500;
        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;

          // Count distinct posts, not raw anchors: one post can be linked
          // several times, which would otherwise stop the scroll too early.
          const postPaths = new Set();
          for (const anchor of document.querySelectorAll('a[href*="/post/"]')) {
            const match = anchor.pathname.match(/^.*?\/post\/[^/]+/);
            if (match) postPaths.add(match[0]);
          }

          // Stop if we hit the limit, bottom of page, or a safety cap
          if (totalHeight >= scrollHeight || postPaths.size >= maxItems || totalHeight > 15000) {
            clearInterval(timer);
            resolve();
          }
        }, 300);
      });
    }, limit);

    // Extract the link targets, ignoring UI action links such as reposts/replies
    const postUrls = await page.evaluate(() =>
      Array.from(document.querySelectorAll('a[href*="/post/"]'))
        .map((a) => a.href)
        .filter((href) => {
          const isPost = href.includes('/post/');
          const isNotAction = !href.includes('/reposts') && !href.includes('/replies');
          return isPost && isNotAction;
        }),
    );

    // Deduplicate on the canonical post URL so variants of the same post
    // (extra path segment, query string) count once against the limit.
    const canonicalUrls = postUrls.map(canonicalPostUrl).filter(Boolean);
    const uniqueUrls = [...new Set(canonicalUrls)].slice(0, limit);

    console.log(`[Search Success] Found ${uniqueUrls.length} unique post URLs.`);

    return {
      keyword,
      count: uniqueUrls.length,
      urls: uniqueUrls,
    };
  } catch (error) {
    console.error(`[Search Error] ${error.message}`);
    throw error;
  } finally {
    await browser.close();
  }
}
||||||
Loading…
Reference in new issue