update scrapper

main
reng 1 month ago
parent 503ed29ffd
commit d8bb02058d
  1. 2
      .gitignore
  2. 7
      v2/app/package-lock.json
  3. 3
      v2/app/src/App.jsx
  4. 4
      v2/app/src/components/point.jsx
  5. 48
      v2/scrapper/main.js
  6. 1123
      v2/scrapper/package-lock.json
  7. 6
      v2/scrapper/package.json
  8. 187
      v2/scrapper/scrapper.js
  9. 114
      v2/scrapper/search.js

2
.gitignore vendored

@ -1,2 +1,4 @@
/v1/node/node_modules
/v1/python/cache
/v2/scrapper/node_modules
/v2/scrapper/scrapped

@ -3279,6 +3279,13 @@
"integrity": "sha512-W1CpvTHykaPH5brv5VHLfQo9D1OYuo0cSBEUQFFT/nBUzM8iD6Lq2/tgG/f1OelbAS1WtaTPQzE5uM49egnngw==",
"license": "MIT"
},
"node_modules/tslib": {
"version": "2.8.1",
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
"integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
"license": "0BSD",
"optional": true
},
"node_modules/tunnel-rat": {
"version": "0.1.2",
"resolved": "https://registry.npmjs.org/tunnel-rat/-/tunnel-rat-0.1.2.tgz",

@ -4,7 +4,6 @@ import { processData, searchByText, searchByTheme, processTheme } from "./utils/
import Graph from "./components/graph";
import { processRawFiles } from "./utils/agent";
const files=['大家想到未來不會很絕望嗎','日常生活','有人懂嗎','看到學貸還款通知書寄來','租屋網站'];
const ContentTags=['Summary','Keywords','Order','User','Content'];
const DisplayTypes=['text','point'];
@ -42,7 +41,7 @@ function App() {
return (
<main className="flex flex-col justify-center gap-2 p-4">
<div>
<div className="flex flex-row flex-wrap gap-2 border rounded p-2">
<button onClick={processRaw}>process raw files</button>
<button onClick={preText}>add data embedding</button>
<button onClick={preTheme}>reset theme embedding</button>

@ -24,7 +24,7 @@ export default function Point({point, index, totalPoints, result, showContent})
console.log("Point lifeTime:", lifeTime);
// animate point size based on lifeTime
const targetSize=PointSize *0.2;
const targetSize=PointSize *0;
const initialSize=0;
const timeline=gsap.timeline({ repeat:-1});
@ -86,7 +86,7 @@ export default function Point({point, index, totalPoints, result, showContent})
/>
<Html>
<div ref={refText} className='text-white p-2 w-[20vw] select-none translate-x-[-50%] translate-y-[-50%] text-center opacity-0'>
<div className='flex flex-row justify-center flex-wrap'>
<div className='flex flex-row justify-center flex-wrap text=[2rem]'>
{payload?.keywords.map((el, index)=><span className='px-2' style={{transform: `translate(${Math.random() * KeywordOffset}px,${Math.random() * KeywordOffset}px)`}}>{el}</span>)}
</div>
{showContent && <pre className='text-xs w-[25vw] whitespace-pre-wrap bg-gray-300 text-black rounded p-2'>{(()=>{

@ -0,0 +1,48 @@
import { mkdirSync, writeFileSync } from "fs";
import { getThread } from "./scrapper.js";
import { searchThreads } from "./search.js";
/**
 * Build the Puppeteer cookie list for a logged-in Threads session from
 * environment variables, so that session secrets never live in source control.
 *
 * Recognised variables (each one is optional):
 *   THREADS_SESSIONID   -> cookie "sessionid" (keep it URL-encoded, exactly as the browser stores it)
 *   THREADS_DS_USER_ID  -> cookie "ds_user_id"
 *   THREADS_CSRFTOKEN   -> cookie "csrftoken"
 *
 * NOTE(review): credentials were previously hard-coded at this spot; anything
 * that was committed should be treated as leaked and revoked.
 *
 * @param {Record<string, string | undefined>} [env=process.env] - Variable source (injectable for tests).
 * @returns {Array<{name: string, value: string, domain: string}>} One entry per variable that is set and non-empty.
 */
function loadCookiesFromEnv(env = process.env) {
  const cookieSources = [
    ["sessionid", "THREADS_SESSIONID"],
    ["ds_user_id", "THREADS_DS_USER_ID"],
    ["csrftoken", "THREADS_CSRFTOKEN"],
  ];
  return cookieSources
    .filter(([, envName]) => Boolean(env[envName]))
    .map(([name, envName]) => ({
      name,
      value: env[envName],
      domain: ".threads.com",
    }));
}

// Empty when no variable is set; the search step then runs without injecting cookies.
const cookies = loadCookiesFromEnv();
/**
 * Search Threads for a fixed keyword, scrape every post that is found and
 * store each result as `scrapped/<postCode>.json`.
 *
 * A failure on one post is logged and does not stop the remaining posts.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const outputDir = "scrapped";
  // The output directory is git-ignored, so it does not exist on a fresh checkout.
  mkdirSync(outputDir, { recursive: true });

  const searchResults = await searchThreads("厭世", 20, cookies);
  console.log(JSON.stringify(searchResults));

  for (const url of searchResults.urls) {
    try {
      const threadItems = await getThread(url);
      console.log(JSON.stringify(threadItems));
      // Use the post code as file name; stop at "/", "?" or "#" so a query
      // string or sub-path never ends up in the file name.
      const fileName = url.split("/post/")[1].split(/[/?#]/)[0];
      writeFileSync(`${outputDir}/${fileName}.json`, JSON.stringify(threadItems, null, 2));
    } catch (err) {
      console.error("Error processing", url, ":", err);
    }
  }
}

// Surface a failed run (e.g. search blocked by the login wall) instead of
// leaving an unhandled promise rejection.
main().catch((err) => {
  console.error("Scrape run failed:", err);
  process.exitCode = 1;
});

File diff suppressed because it is too large Load Diff

@ -0,0 +1,6 @@
{
"type":"module",
"dependencies": {
"puppeteer": "^24.34.0"
}
}

@ -0,0 +1,187 @@
import puppeteer from 'puppeteer';
/**
 * Deep lookup for Meta Relay payloads.
 *
 * Walks an arbitrarily nested JSON value and collects every value stored under
 * one of `keys`. It additionally follows the
 * ScheduledServerJS -> __bbox -> RelayPrefetchedStreamCache argument layout.
 *
 * The same value can be reported more than once, because special-cased
 * containers are visited by their dedicated rule and again by the generic walk.
 *
 * @param {*} obj - Parsed JSON value to search; non-objects yield an empty list.
 * @param {string[]} [keys] - Property names whose values are collected.
 * @returns {Array<*>} Collected values in traversal order.
 */
function nestedLookup(obj, keys = ["thread_items", "replies", "child_posts", "threaded_replies", "post", "posts", "edges", "node", "reply_threads"]) {
  // Keys whose array values are expanded element by element instead of being collected whole.
  const listKeys = ["edges", "thread_items", "reply_threads"];
  const found = [];

  const visit = (node) => {
    if (node === null || typeof node !== "object") return;

    if (Array.isArray(node)) {
      // A RelayPrefetchedStreamCache call looks like
      // ["RelayPrefetchedStreamCache", "next", null, ["key", {data}]];
      // the data object is the second entry of the argument array.
      if (node[0] === "RelayPrefetchedStreamCache" && Array.isArray(node[3])) {
        const payload = node[3][1];
        if (payload) visit(payload);
      }
      for (const element of node) {
        visit(element);
      }
      return;
    }

    // Unwrap the __bbox envelope first.
    const bbox = node.__bbox;
    if (bbox?.require) visit(bbox.require);
    if (bbox?.define) visit(bbox.define);

    for (const name in node) {
      const value = node[name];
      if (value !== null && keys.includes(name)) {
        const expandAsList = listKeys.includes(name) && Array.isArray(value);
        if (expandAsList) {
          visit(value);
        } else {
          found.push(value);
        }
      }
      // Keep descending regardless of whether the key matched
      // (visit ignores anything that is not an object).
      visit(value);
    }
  };

  visit(obj);
  return found;
}
/**
 * Normalise one raw item into a flat post record (supports several GraphQL
 * container shapes).
 *
 * The post object is looked up, in order, at `data.post`, `data.node.post`,
 * `data.thread_items[0].post`, `data.posts[0]`, `data` itself (when it has a
 * `caption`), and finally inside `data.result.data`.
 *
 * Media URLs are deliberately not part of the record, so media metadata is not
 * inspected at all; a malformed media entry therefore cannot make parsing fail.
 *
 * @param {*} data - Raw item as collected from the page payload.
 * @returns {{text: string, published_on: *, id: *, code: *, username: *, like_count: number, reply_count: number} | null}
 *   The record, or null when no post with an `id`/`pk` can be located.
 */
function parseThread(data) {
  if (!data) return null;

  // Try every known container path for the post object.
  let post =
    data.post ||
    data.node?.post ||
    data.thread_items?.[0]?.post ||
    data.posts?.[0] ||
    (data.caption ? data : null);

  // GraphQL responses wrap the payload in result.data.
  if (!post && data.result?.data) {
    const payload = data.result.data;
    post = payload.text_post_app_info || payload.post || payload;
  }

  const id = post?.id || post?.pk;
  if (!id) return null;

  return {
    // Fall back to the quoted post's caption when the post has no text of its own.
    text: post.caption?.text || post.text_post_app_info?.share_info?.quoted_post?.caption?.text || "",
    published_on: post.taken_at,
    id,
    code: post.code,
    username: post.user?.username,
    like_count: post.like_count || 0,
    reply_count: post.direct_reply_count || post.reply_count || 0,
  };
}
/**
 * Open a Threads post in headless Chrome and extract the post and the other
 * posts embedded in the page's `application/json` script payloads.
 *
 * Steps: validate the URL, load the page, scroll to trigger further streamed
 * payloads, wait, read every JSON script mentioning "ScheduledServerJS", run
 * `nestedLookup` + `parseThread` over it and de-duplicate the records by id.
 *
 * @param {string} postUrl - Post URL containing "threads." and a "/post/<code>" segment.
 * @param {Array<object>} [cookies=[]] - Optional Puppeteer-formatted cookies injected before navigation.
 * @returns {Promise<{thread: object, replies: object[]}>} The post whose code matches the URL, and all
 *   other parsed posts sorted by ascending `published_on`.
 * @throws {Error} When the URL is invalid or the main post cannot be located in the page data.
 */
export async function getThread(postUrl, cookies = []) {
  if (!postUrl?.includes("threads.")) throw new Error("無效的 Threads 網址");
  // Stop at "/", "?" or "#": a trailing query string or sub-path would otherwise
  // become part of the code and the main post would never be matched.
  const postCodeFromUrl = postUrl.split("/post/")[1]?.split(/[/?#]/)[0];
  if (!postCodeFromUrl) {
    // Without a code, the lookup below could match any item whose `code` is undefined.
    throw new Error("無效的 Threads 網址:找不到貼文代碼");
  }
  console.log(`[目標鎖定] 貼文代碼: ${postCodeFromUrl}`);

  const browser = await puppeteer.launch({
    headless: "new",
    args: ['--disable-blink-features=AutomationControlled', '--no-sandbox', '--disable-setuid-sandbox']
  });
  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
    if (cookies?.length > 0) {
      await page.setCookie(...cookies);
    }
    await page.goto(postUrl, { waitUntil: 'networkidle2', timeout: 30000 });

    // Scroll in 500px steps every 100ms until the bottom or a 5000px cap;
    // scrolling is what makes the page load the remaining streamed blocks.
    await page.evaluate(async () => {
      await new Promise((resolve) => {
        const distance = 500;
        let totalHeight = 0;
        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;
          if (totalHeight >= scrollHeight || totalHeight > 5000) {
            clearInterval(timer);
            resolve();
          }
        }, 100);
      });
    });
    // Give the streamed blocks time to finish loading.
    await new Promise((resolve) => setTimeout(resolve, 5000));

    const allScripts = await page.$$eval(
      'script[type="application/json"]',
      (scripts) => scripts.map((s) => s.textContent)
    );

    const allParsedItems = new Map();
    for (const content of allScripts) {
      if (!content || !content.includes("ScheduledServerJS")) continue;

      let rawItems;
      try {
        rawItems = nestedLookup(JSON.parse(content)).flat();
      } catch (err) {
        // Best effort: skip this block, but say so instead of failing silently.
        console.warn(`[解析警告] 略過無法解析的 script 區塊: ${err.message}`);
        continue;
      }

      for (const item of rawItems) {
        if (!item) continue;
        try {
          const parsed = parseThread(item);
          if (parsed?.id) allParsedItems.set(parsed.id, parsed);
        } catch (err) {
          // One malformed item must not discard the rest of the block.
          console.warn(`[解析警告] 略過無法解析的項目: ${err.message}`);
        }
      }
    }

    const itemsArray = Array.from(allParsedItems.values());
    const mainThread = itemsArray.find((t) => t.code === postCodeFromUrl);
    if (!mainThread) {
      throw new Error("無法定位主貼文。這通常是因為 Auth 阻擋或頁面未完全渲染。");
    }

    const authorName = mainThread.username;
    const replies = itemsArray
      .filter((t) => t.code !== postCodeFromUrl)
      .sort((a, b) => a.published_on - b.published_on);
    console.log(`[解析成功] 找到主貼文,作者: ${authorName},回覆數量: ${replies.length}`);
    return { thread: mainThread, replies };
  } finally {
    await browser.close();
  }
}

@ -0,0 +1,114 @@
import puppeteer from 'puppeteer';
/**
 * Threads search scraper.
 * Opens the search results page for a keyword, scrolls to load more results
 * and collects the post URLs found on the page.
 *
 * @param {string} keyword - The search term.
 * @param {number} [limit=20] - Maximum number of URLs to return.
 * @param {Array<object>} [cookies=[]] - Optional Puppeteer-formatted cookies used to get past the login wall.
 * @returns {Promise<{keyword: string, count: number, urls: string[]}>} The keyword, the number of
 *   URLs and the de-duplicated post URLs (at most `limit`).
 * @throws {Error} When `keyword` is empty, when Threads redirects to the login page, or when
 *   navigation fails.
 */
export async function searchThreads(keyword, limit = 20, cookies = []) {
  if (!keyword) {
    throw new Error("Please provide a keyword for the search.");
  }
  const searchUrl = `https://www.threads.net/search?q=${encodeURIComponent(keyword)}&serp_type=default`;
  console.log(`[Search Start] Keyword: "${keyword}" | URL: ${searchUrl}`);

  const browser = await puppeteer.launch({
    headless: "new",
    args: [
      '--disable-blink-features=AutomationControlled',
      '--no-sandbox',
      '--disable-setuid-sandbox'
    ]
  });
  try {
    const page = await browser.newPage();
    // Anti-detection headers
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36');
    await page.setExtraHTTPHeaders({
      'Accept-Language': 'en-US,en;q=0.9',
    });
    // Inject cookies if provided to bypass the login wall
    if (cookies && cookies.length > 0) {
      console.log("[Auth] Injecting session cookies...");
      await page.setCookie(...cookies);
    }
    await page.setViewport({ width: 1280, height: 900 });

    await page.goto(searchUrl, { waitUntil: 'networkidle2', timeout: 60000 });

    // Check if we were redirected to the login page
    if (page.url().includes('/login')) {
      console.error("[Auth Error] Threads redirected to login. You must provide valid session cookies.");
      throw new Error("Authentication required: Threads search is restricted to logged-in users.");
    }

    // Wait for the first post link; continue anyway so an empty result is returned cleanly.
    try {
      await page.waitForSelector('a[href*="/post/"]', { timeout: 20000 });
    } catch (e) {
      console.warn("[Warning] Results did not load. This might be a login wall or no results found.");
    }

    // Scroll and collect in one pass. The stop condition counts UNIQUE post URLs
    // after filtering: counting raw anchors (duplicates, repost/reply links) can
    // end the scroll before `limit` distinct posts are on the page.
    const postUrls = await page.evaluate(async (maxItems) => {
      const seen = new Set();
      const collect = () => {
        for (const anchor of document.querySelectorAll('a[href*="/post/"]')) {
          const href = anchor.href;
          // Keep actual post links; ignore UI actions such as repost/reply lists.
          const isPost = href.includes('/post/');
          const isNotAction = !href.includes('/reposts') && !href.includes('/replies');
          if (isPost && isNotAction) seen.add(href);
        }
      };

      await new Promise((resolve) => {
        const distance = 500;
        let totalHeight = 0;
        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;
          collect();
          // Stop at the limit, the bottom of the page, or a safety cap.
          if (totalHeight >= scrollHeight || seen.size >= maxItems || totalHeight > 15000) {
            clearInterval(timer);
            resolve();
          }
        }, 300);
      });

      return [...seen];
    }, limit);

    // Already unique; the Set is kept as a cheap guard before applying the limit.
    const uniqueUrls = [...new Set(postUrls)].slice(0, limit);
    console.log(`[Search Success] Found ${uniqueUrls.length} unique post URLs.`);
    return {
      keyword,
      count: uniqueUrls.length,
      urls: uniqueUrls
    };
  } catch (error) {
    console.error(`[Search Error] ${error.message}`);
    throw error;
  } finally {
    await browser.close();
  }
}
Loading…
Cancel
Save