You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

192 lines
6.7 KiB

import { jsonToAgent } from "./agent.js";
import { getThread } from "./scrapper.js";
import { searchThreads } from "./search.js";
import { writeFileSync, existsSync, mkdirSync, readdirSync, statSync } from "fs";
import dotenv from 'dotenv';
import { processData } from "./embeddings.js";
dotenv.config();
/**
 * Session cookies passed to `searchThreads` and `getThread` on every request.
 *
 * Each value can be supplied through the environment (the file calls
 * `dotenv.config()` before this point, so a `.env` file works too):
 *   THREADS_SESSIONID, THREADS_DS_USER_ID, THREADS_CSRFTOKEN
 *
 * SECURITY(review): the literal fallbacks below are account credentials
 * committed to source control. They are kept only so existing runs keep
 * working; rotate them and delete the fallbacks once the environment
 * variables are configured.
 *
 * `||` (not `??`) is intentional: an empty `VAR=` line in `.env` should fall
 * back as well.
 */
const cookies = [
  [
    'sessionid',
    process.env.THREADS_SESSIONID
      || '64605724719%3ALlZCmwghVyOAck%3A23%3AAYhptDoKttkGRWkpa5583neohBfLXlGfOlwPPmdP1w',
  ],
  ['ds_user_id', process.env.THREADS_DS_USER_ID || '64605724719'],
  ['csrftoken', process.env.THREADS_CSRFTOKEN || 'SI5YedKIeuSAgAbdtfynUwzrmSAGquxH'],
].map(([name, value]) => ({
  name,
  value,
  domain: '.threads.com',
  path: '/',
  httpOnly: true,
  secure: true,
  // Unix timestamp (seconds) one year after the module is loaded.
  expires: Math.floor(Date.now() / 1000) + 60 * 60 * 24 * 365,
}));
/**
 * Pool of candidate search terms (space-separated Traditional Chinese words).
 *
 * None of the step functions visible in this file read this list; `step1`
 * uses its own inline selections. The rows below keep the original author's
 * line grouping and are flattened into a single array of strings, in order.
 */
const Keywords = [
  ['學貸 房租 焦慮', '獨立 失敗', '尷尬 年齡', '還沒買房', '卡住 職涯'],
  ['躺平 世代', '家境 差距', '文組 標籤', '覺醒 厭世', '年輕'],
  ['慣老闆 權益', '行政 刁難', '學生會 被架空', '職場 不平等', '家長會 決定'],
  ['租屋 系統 惡意', '升學 制度 不公', '系統性 壓榨', '黑箱 決策', '系統 漏洞', '校規', 'SOP', '行政 裁量權'],
  ['租屋 格局 壓迫', '辦公室 權力', '校園 空間 設計', '宿舍 小 隱私', '空間 不舒服'],
  ['會議室 借不到', '圖書館 規定', '公共空間 誰的', '校車 收費 影響', '畢業 退場'],
  ['人生 妥協', '我的 決定', '邊界 被侵犯', '下班 不讀', '工作 生活 平衡'],
  ['參與 沒用', '開會 敷衍', '假 民主', '意見 被忽略', '提案 被擋', '連署 實質'],
  ['迷因 轉化 焦慮', '幽默 反抗', '厭世 創作', '負面情緒 詩意', '話語權 奪回'],
  ['歸屬感 飄', '租來的 人生', '網路 社群 溫暖', '家鄉 不屬於', '靈魂 避難所'],
].flat();
/** Suffix of the `processed_<Version>` directory that step2 writes and step3 reads. */
const Version = 'v2-1';

/** When true, step1 and step2 only process a small sample of their inputs. */
const DEBUG_MODE = false;

/** Search mode forwarded to `searchThreads`: 'KEYWORD' or 'TAG'. */
const SCRAP_TYPE = 'TAG';
/**
 * Step 1 — scrape.
 *
 * Runs `searchThreads` for every selected term (tags or keywords, depending
 * on `SCRAP_TYPE`), fetches each returned URL with `getThread`, and writes the
 * result to `scrapped/<term>/<postId>.json`.
 *
 * Failures are isolated: a failed search skips only that term, and a failed
 * thread skips only that URL. Both are logged with `console.error`.
 *
 * @returns {Promise<void>} Resolves once every term has been processed.
 */
async function step1() {
  const chooseThemes = ['職涯', '00後整頓職場', '生活方式', '長大後才懂的事', 'privilege'];
  const chooseKeywords = ['成年 限制 學貸 獨立', '學貸 房租 焦慮', '職場 權利 privilege', '職涯 嚮往', '門禁 自由 限制', '人生 妥協 期待', '人生 自主'];

  const candidates = SCRAP_TYPE === 'KEYWORD' ? chooseKeywords : chooseThemes;
  // Debug runs only exercise the first term of the active list.
  const terms = DEBUG_MODE ? candidates.slice(0, 1) : candidates;

  for (const term of terms) {
    let searchResults;
    try {
      // The 50 is presumably a result limit — confirm against search.js.
      searchResults = await searchThreads(term, 50, cookies, SCRAP_TYPE);
    } catch (err) {
      // One failing search must not abort the remaining terms.
      console.error("Error searching", term, ":", err);
      continue;
    }
    console.log(JSON.stringify(searchResults));

    // Tolerate a result without a usable `urls` array instead of throwing.
    const urls = Array.isArray(searchResults?.urls) ? searchResults.urls : [];
    const dir = `scrapped/${term}`;

    for (const url of urls) {
      try {
        const threadItems = await getThread(url, cookies);

        // Post id = path segment right after "/post/", without any trailing
        // path, query string or fragment.
        const postId = String(url).split("/post/")[1]?.split(/[/?#]/)[0];
        if (!postId) {
          throw new Error(`No post id found in URL: ${url}`);
        }

        // `recursive: true` makes this a no-op when the directory exists.
        mkdirSync(dir, { recursive: true });
        writeFileSync(`${dir}/${postId}.json`, JSON.stringify(threadItems, null, 2));
      } catch (err) {
        console.error("Error processing", url, ":", err);
      }
    }
  }
}
/**
 * Step 2 — run the agent over scraped threads.
 *
 * For every sub-directory of `./scrapped`, passes each file to `jsonToAgent`
 * (with the `agent_v2.txt` prompt file name) and writes the returned value as
 * JSON to `./processed_<Version>/<folder>/<file>`.
 *
 * Files inside one folder are processed concurrently, and the function waits
 * for all of them before moving to the next folder, so the returned promise
 * only resolves after every output has been written. A failure for one file
 * is logged and does not stop the others.
 *
 * In `DEBUG_MODE` only the first two directory entries and the first file of
 * each are processed.
 *
 * @returns {Promise<void>}
 */
async function step2() {
  const folders = readdirSync('./scrapped');
  console.log("Folders in raw folder:", folders);

  const debugFolderLimit = 2;
  const selectedFolders = DEBUG_MODE ? folders.slice(0, debugFolderLimit) : folders;

  for (const folder of selectedFolders) {
    const inputDir = `./scrapped/${folder}`;
    if (!statSync(inputDir).isDirectory()) {
      console.log(`Skipping ${folder}: not a directory.`);
      continue;
    }

    const files = readdirSync(inputDir);
    console.log(`Files in folder ${folder}:`, files);

    const selectedFiles = DEBUG_MODE ? files.slice(0, 1) : files;
    const outputDir = `./processed_${Version}/${folder}`;

    // `forEach(async ...)` would discard these promises; collect and await
    // them so callers can rely on the work being finished.
    await Promise.all(selectedFiles.map(async (file) => {
      try {
        const data = await jsonToAgent(`${inputDir}/${file}`, 'agent_v2.txt');
        const outputFilePath = `${outputDir}/${file}`;
        // `recursive: true` makes this a no-op when the directory exists.
        mkdirSync(outputDir, { recursive: true });
        writeFileSync(outputFilePath, JSON.stringify(data, null, 2));
        console.log("Saved agent response to", outputFilePath);
      } catch (err) {
        console.error("Error processing agent for folder", folder, ":", err);
      }
    }));
  }
}
/**
 * Step 3 — build embeddings from the agent output.
 *
 * For every sub-directory of `./processed_<Version>`, calls
 * `processData(<file path>, false, 'data-v3')` for each file.
 *
 * Files inside one folder are processed concurrently, and the function waits
 * for all of them before moving to the next folder, so the returned promise
 * only resolves once every `processData` call has settled. A failure for one
 * file is logged and does not stop the others.
 *
 * @returns {Promise<void>}
 */
async function step3() {
  const processedRoot = `./processed_${Version}`;
  const folders = readdirSync(processedRoot);
  console.log("Folders in processed folder:", folders);

  for (const folder of folders) {
    const folderPath = `${processedRoot}/${folder}`;
    if (!statSync(folderPath).isDirectory()) {
      console.log(`Skipping ${folder}: not a directory.`);
      continue;
    }

    const files = readdirSync(folderPath);
    console.log(`Files in folder ${folder}:`, files);

    // `forEach(async ...)` would discard these promises; collect and await
    // them so callers can rely on the work being finished.
    await Promise.all(files.map(async (file) => {
      try {
        await processData(`${folderPath}/${file}`, false, `data-v3`);
      } catch (err) {
        console.error("Error processing embeddings for folder", folder, ":", err);
      }
    }));
  }
}
/**
 * Entry point. Runs the pipeline steps that are currently enabled.
 * step2 (agent) and step3 (embeddings) are toggled by un-commenting them.
 *
 * @returns {Promise<void>}
 */
async function main() {
  await step1();
  // await step2();
  // await step3();
}

// Do not leave the promise floating: report the failure and signal it to the
// caller through the exit code.
main().catch((err) => {
  console.error("Pipeline failed:", err);
  process.exitCode = 1;
});