// obsidian-sample-plugin/main.ts

import { App, Modal, Notice, Plugin, PluginSettingTab, Setting, requestUrl, TFile, Menu, MenuItem, FrontMatterCache, TFolder, TextComponent, TextAreaComponent, ButtonComponent } from 'obsidian';
import taskDefinitions from './task_definitions.json';
// ----- CONSTANTS -----
const SUBPROCESSOR_URL_KEYWORDS = [
'subprocessor', 'sub-processor', 'sub_processor',
'vendor-list', 'vendorlist', 'third-party-list', 'thirdpartylist',
'service-providers', 'serviceproviders',
'dpa-exhibit', 'dpa/exhibit', 'data-processing-addendum/exhibit',
'trust-center/sub', 'legal/subprocessors'
];
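// Illustrative example (URL assumed): a candidate such as "https://example.com/legal/subprocessors"
// matches the 'legal/subprocessors' keyword, so if verification later fails it is flagged for
// manual review in the analysis log rather than silently dropped.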
// ----- SETTINGS INTERFACE AND DEFAULTS -----
interface ProcessorProcessorSettings {
serpApiKey: string;
rightbrainClientId: string;
rightbrainClientSecret: string;
rightbrainOrgId: string;
rightbrainProjectId: string;
rightbrainVerifyUrlTaskId: string;
rightbrainExtractEntitiesTaskId: string;
rightbrainExtractInputField: string;
rightbrainExtractOutputThirdPartyField: string;
rightbrainExtractOutputOwnEntitiesField: string;
rightbrainDeduplicateSubprocessorsTaskId: string;
rightbrainDuckDuckGoSearchTaskId: string;
createPagesForOwnEntities: boolean;
verboseDebug: boolean;
maxResultsPerProcessor: number;
maxRecursiveDepth: number;
discoveryCacheDays: number;
processorsFolderPath: string;
analysisLogsFolderPath: string;
rightbrainFindDpaTaskId: string;
rightbrainFindTosTaskId: string;
rightbrainFindSecurityTaskId: string;
}
const DEFAULT_SETTINGS: ProcessorProcessorSettings = {
serpApiKey: '',
rightbrainClientId: '',
rightbrainClientSecret: '',
rightbrainOrgId: '',
rightbrainProjectId: '',
rightbrainVerifyUrlTaskId: '',
rightbrainExtractEntitiesTaskId: '',
rightbrainExtractInputField: 'page_text',
rightbrainExtractOutputThirdPartyField: 'third_party_subprocessors',
rightbrainExtractOutputOwnEntitiesField: 'own_entities',
rightbrainDeduplicateSubprocessorsTaskId: '',
rightbrainDuckDuckGoSearchTaskId: '',
createPagesForOwnEntities: false,
verboseDebug: false,
maxResultsPerProcessor: 1,
maxRecursiveDepth: 1,
discoveryCacheDays: 30,
processorsFolderPath: 'Processors',
analysisLogsFolderPath: 'Analysis Logs',
rightbrainFindDpaTaskId: '',
rightbrainFindTosTaskId: '',
rightbrainFindSecurityTaskId: '',
}
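// These settings are persisted via Plugin.loadData()/saveData(), which Obsidian stores as data.json
// inside this plugin's folder under the vault's .obsidian/plugins directory.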
// ----- DATA STRUCTURES -----
interface SerpApiResult {
title: string; url: string; snippet: string; searchQuery?: string;
processorName: string; documentType: string;
sourceDpaUrl?: string;
}
interface ExtractedRelationship {
PrimaryProcessor: string; SubprocessorName: string; ProcessingFunction: string;
Location: string; RelationshipType: 'uses_subprocessor' | 'is_own_entity';
SourceURL: string; VerificationReasoning: string;
}
interface ProcessedUrlInfo extends Partial<SerpApiResult> {
url: string; title?: string; verificationMethod?: string; verificationReasoning?: string;
isList?: boolean; isCurrent?: boolean; extractedSubprocessorsCount?: number; documentType: string;
}
interface SearchData {
collectedRelationships: ExtractedRelationship[];
processedUrlDetails: ProcessedUrlInfo[];
flaggedCandidateUrlCount: number;
}
interface SubprocessorPageInfo {
file_path: string;
page_name: string;
aliases: string[];
}
interface DeduplicationResultItem {
survivor_file_path: string;
duplicate_file_paths: string[];
reasoning?: string;
}
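// Illustrative example (all values assumed): a single extracted relationship might look like
// {
//   PrimaryProcessor: "ExampleCorp", SubprocessorName: "Example Cloud Provider",
//   ProcessingFunction: "Hosting and storage", Location: "EU",
//   RelationshipType: "uses_subprocessor",
//   SourceURL: "https://example.com/legal/subprocessors",
//   VerificationReasoning: "Page lists current subprocessors for ExampleCorp."
// }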
// ----- MAIN PLUGIN CLASS -----
export default class ProcessorProcessorPlugin extends Plugin {
settings: ProcessorProcessorSettings;
private processedInCurrentRecursiveSearch: Set<string>;
async onload() {
this.processedInCurrentRecursiveSearch = new Set<string>();
await this.loadSettings();
this.addRibbonIcon('link', 'Manually Add Subprocessor List URL', (evt: MouseEvent) => {
new ManualInputModal(this.app, async (processorName, listUrl, isPrimary) => {
if (processorName && listUrl) {
new Notice(`Processing manual URL input for: ${processorName}`);
const processorFile = await this.ensureProcessorFile(processorName, true, isPrimary); // pass the primary/subprocessor flag through to file creation
if (processorFile) {
const searchData = await this.fetchDataFromDirectUrl(processorName, listUrl);
if (searchData) {
await this.persistSubprocessorInfo(processorName, processorFile, searchData, isPrimary);
if (searchData.flaggedCandidateUrlCount > 0) {
new Notice(`${searchData.flaggedCandidateUrlCount} URL(s) looked promising but couldn't be verified. Check logs.`);
}
} else {
new Notice(`Could not process data from direct URL for ${processorName}.`);
}
} else {
new Notice(`Could not create or find file for ${processorName} in ${this.settings.processorsFolderPath}`);
}
}
}).open();
});
this.addRibbonIcon('paste', 'Input Subprocessor List from Text', (evt: MouseEvent) => {
this.openManualTextEntryModal();
});
this.addCommand({
id: 'run-processor-search-global',
name: 'Search for Subprocessors (Discover)',
callback: () => {
new SearchModal(this.app, this.settings, async (processorName) => {
if (processorName) {
new Notice(`Starting discovery search for: ${processorName}`);
const processorFile = await this.ensureProcessorFile(processorName, true);
if (processorFile) {
await this.discoverAndProcessProcessorPage(processorName, processorFile);
} else {
new Notice(`Could not create or find file for ${processorName} in ${this.settings.processorsFolderPath}`);
}
}
}).open();
}
});
this.addCommand({
id: 'input-subprocessor-list-from-text',
name: 'Input Subprocessor List from Text',
callback: () => {
this.openManualTextEntryModal();
}
});
this.addCommand({
id: 'run-processor-search-recursive',
name: 'Search for Subprocessors (Recursive Discover)',
callback: () => {
new SearchModal(this.app, this.settings, async (processorName) => {
if (processorName) {
// The recursion depth could be made configurable per run; for now it uses the plugin setting.
await this.discoverRecursively(processorName, undefined, this.settings.maxRecursiveDepth);
}
}).open();
}
});
this.addCommand({
id: 'force-merge-processors-from-palette',
name: 'Force Merge processor files...',
callback: () => {
this.openFileSelectorMergeModal();
}
});
this.addCommand({
id: 'complete-first-time-setup',
name: 'Complete First-Time Setup (Credentials & Tasks)',
callback: () => {
new PasteEnvModal(this.app, this).open();
}
});
this.registerEvent(
this.app.workspace.on('file-menu', (menu: Menu, fileOrFolder: TFile | TFolder, source: string) => {
// Logic for single folders
if (fileOrFolder instanceof TFolder) {
const folder = fileOrFolder as TFolder;
if (folder.path === this.settings.processorsFolderPath) {
menu.addItem((item: MenuItem) => {
item.setTitle('Deduplicate Subprocessor Pages')
.setIcon('git-pull-request-draft')
.onClick(async () => {
if (!this.settings.rightbrainDeduplicateSubprocessorsTaskId) {
new Notice("Deduplication Task ID not set in plugin settings.");
return;
}
new Notice(`Starting deduplication for folder: ${folder.path}`);
await this.runDeduplicationForFolder(folder);
});
});
}
}
// Logic for single files
else if (fileOrFolder instanceof TFile && fileOrFolder.extension === 'md') {
const file = fileOrFolder as TFile;
if (file.path.startsWith(this.settings.processorsFolderPath + "/")) {
const fileCache = this.app.metadataCache.getFileCache(file);
const frontmatter = fileCache?.frontmatter;
const originalProcessorName = (frontmatter?.aliases && Array.isArray(frontmatter.aliases) && frontmatter.aliases.length > 0)
? frontmatter.aliases[0]
: file.basename;
menu.addItem((item: MenuItem) => {
item.setTitle('Map Subprocessor Relationships')
.setIcon('chevrons-down-up')
.onClick(async () => {
new Notice(`Starting recursive discovery from: ${originalProcessorName}`);
await this.discoverRecursively(originalProcessorName, file, this.settings.maxRecursiveDepth);
});
});
menu.addItem((item: MenuItem) => {
item.setTitle('Discover Subprocessor List').setIcon('wand')
.onClick(async () => {
new Notice(`Discovering subprocessor list for: ${originalProcessorName}`);
await this.discoverAndProcessProcessorPage(originalProcessorName, file);
});
});
menu.addItem((item: MenuItem) => {
item.setTitle('Enrich Processor Documentation')
.setIcon('book-plus')
.onClick(async () => {
new Notice(`Enriching documentation for: ${originalProcessorName}`);
await this.enrichProcessorFile(originalProcessorName, file);
});
});
menu.addItem((item: MenuItem) => {
item.setTitle('Add Subprocessor List URL').setIcon('plus-circle')
.onClick(async () => {
new ManualInputModal(this.app, async (pName, listUrl, isPrimary) => {
if (listUrl) {
new Notice(`Processing manual URL input for: ${originalProcessorName} using URL: ${listUrl}`);
const searchData = await this.fetchDataFromDirectUrl(originalProcessorName, listUrl);
if (searchData) {
await this.persistSubprocessorInfo(originalProcessorName, file, searchData, isPrimary);
if (searchData.flaggedCandidateUrlCount > 0) {
new Notice(`${searchData.flaggedCandidateUrlCount} URL(s) looked promising but couldn't be verified. Check logs.`);
}
} else {
new Notice(`Could not process data from direct URL for ${originalProcessorName}.`);
}
}
}, originalProcessorName).open();
});
});
menu.addItem((item: MenuItem) => {
item.setTitle('Input Subprocessor List from Text').setIcon('file-input')
.onClick(async () => {
this.openManualTextEntryModal(originalProcessorName);
});
});
}
}
})
);
this.addSettingTab(new ProcessorProcessorSettingTab(this.app, this));
console.log('Processor Processor plugin loaded.');
}
onunload() {
console.log('Processor Processor plugin unloaded.');
}
async loadSettings() {
this.settings = Object.assign({}, DEFAULT_SETTINGS, await this.loadData());
}
async saveSettings() {
await this.saveData(this.settings);
}
private openManualTextEntryModal(initialProcessorName?: string) {
if (!this.settings.rightbrainExtractEntitiesTaskId) {
new Notice("RightBrain Task ID for entity extraction is not configured. Please set it in plugin settings.");
return;
}
new ManualTextEntryModal(this.app, async (processorName, pastedText, isPrimary) => {
if (processorName && pastedText) {
new Notice(`Processing pasted text for: ${processorName}`);
// Pass the 'isPrimary' flag to ensure the correct tag is applied
const processorFile = await this.ensureProcessorFile(processorName, true, isPrimary);
if (processorFile) {
const searchData = await this.fetchDataFromPastedText(processorName, pastedText);
if (searchData) {
// Pass the 'isPrimary' flag here as well for consistency
await this.persistSubprocessorInfo(processorName, processorFile, searchData, isPrimary);
} else {
new Notice(`Could not process data from pasted text for ${processorName}.`);
}
} else {
new Notice(`Could not create or find file for ${processorName} in ${this.settings.processorsFolderPath}`);
}
}
}, initialProcessorName).open();
}
private sanitizeNameForFilePathAndAlias(entityName: string | undefined | null): { filePathName: string, originalNameAsAlias: string } {
const originalName = (entityName || "Unknown Entity").trim();
let baseNameForFile = originalName;
// Check for "dba" patterns to prioritize the "doing business as" name for the file
const dbaRegex = /^(.*?)\s+(?:dba|d\/b\/a|doing business as)\s+(.*)$/i;
const dbaMatch = originalName.match(dbaRegex);
if (dbaMatch && dbaMatch[2]) { // dbaMatch[2] is the name after 'dba'
baseNameForFile = dbaMatch[2].trim(); // Use this as the base for the filename
}
// Remove commas from the base name for the file, as they can be problematic in links/tags
let filePathName = baseNameForFile.replace(/,/g, '');
// Replace characters forbidden in file paths
filePathName = filePathName.replace(/[\\/:*?"<>|]/g, '').trim();
// If filePathName becomes empty after sanitization (e.g., name was just "///"),
// use a sanitized version of the original full name or a fallback.
if (!filePathName) {
filePathName = originalName.replace(/[\\/:*?"<>|,]/g, '').replace(/\s+/g, '_') || "Sanitized_Entity";
}
if (!filePathName) { // Final fallback if it's still somehow empty
filePathName = "Sanitized_Entity_" + Date.now();
}
return {
filePathName: filePathName,
originalNameAsAlias: originalName // The original full name is always used as an alias
};
}
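// Example for sanitizeNameForFilePathAndAlias (hypothetical input): "Acme Corp., Inc. d/b/a WidgetWorks"
// yields { filePathName: "WidgetWorks", originalNameAsAlias: "Acme Corp., Inc. d/b/a WidgetWorks" },
// so the note is created as "WidgetWorks.md" while the full legal name is kept as an alias.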
private scrubHyperlinks(text: string | undefined | null): string {
if (!text) return "N/A"; // Return "N/A" if input is null, undefined, or empty
let scrubbedText = String(text); // Ensure it's a string
// Remove Markdown links: [link text](url) -> link text
scrubbedText = scrubbedText.replace(/\[(.*?)\]\((?:.*?)\)/g, '$1');
// Remove HTML links: <a href="...">link text</a> -> link text
scrubbedText = scrubbedText.replace(/<a[^>]*>(.*?)<\/a>/gi, '$1');
// Strip any remaining HTML tags
scrubbedText = scrubbedText.replace(/<[^>]+>/g, '');
// Normalize whitespace (multiple spaces/newlines to single space)
scrubbedText = scrubbedText.replace(/\s+/g, ' ').trim();
return scrubbedText || "N/A"; // Return "N/A" if the result is an empty string
}
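// Example: scrubHyperlinks('See the [DPA](https://example.com/dpa) for <a href="#">details</a>.')
// returns "See the DPA for details." — links are reduced to their text and whitespace is collapsed.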
private addRelationship(
collectedRelationships: ExtractedRelationship[],
seenRelationships: Set<string>, // To track unique (PrimaryProcessor, SubprocessorName, Type) tuples
processorName: string, // The name of the primary processor this relationship pertains to
entity: any, // The raw entity object (e.g., from RightBrain)
type: ExtractedRelationship['RelationshipType'], // 'uses_subprocessor' or 'is_own_entity'
sourceUrl: string, // The URL where this information was found/verified
verificationReasoning: string | undefined | null // Reasoning from verification, if any
): number { // Returns 1 if a new relationship was added, 0 otherwise
const originalEntityName = entity.name?.trim();
if (!originalEntityName) return 0; // Skip if no name
// Use the original, unaltered entity name for storage and comparison
// The sanitization for file paths will happen later when creating files.
const subprocessorNameToStore = originalEntityName;
// Special handling for OpenAI - skip if it's identifying its own known affiliates as "own_entity"
// This prevents OpenAI from listing itself or its core components as if they were distinct subprocessors *of itself*.
if (processorName.toLowerCase() === "openai" && type === "is_own_entity") {
const openaiAffiliates = ["openai global", "openai, opco", "openai ireland", "openai uk", "openai japan", "openaiglobal", "openai opco", "openai llc"];
// If the entity name looks like one of OpenAI's own common names/affiliates, don't add it as an "own_entity" relationship for OpenAI.
if (openaiAffiliates.some(aff => originalEntityName.toLowerCase().includes(aff)) || originalEntityName.toLowerCase() === "openai") {
// if (this.settings.verboseDebug) console.log(`Skipping adding '${originalEntityName}' as own_entity for OpenAI due to self-reference/affiliate rule.`);
return 0;
}
}
// Create a unique tuple for this relationship to avoid duplicates *within the current processing run*
const relTuple = `${processorName}|${subprocessorNameToStore}|${type}`;
if (!seenRelationships.has(relTuple)) {
collectedRelationships.push({
PrimaryProcessor: processorName,
SubprocessorName: subprocessorNameToStore, // Store the original name
ProcessingFunction: this.scrubHyperlinks(entity.processing_function),
Location: this.scrubHyperlinks(entity.location),
RelationshipType: type,
SourceURL: sourceUrl,
VerificationReasoning: this.scrubHyperlinks(verificationReasoning)
});
seenRelationships.add(relTuple);
return 1;
}
return 0;
}
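// Dedup key example (names assumed): for processorName "ExampleCorp", entity "Example Cloud Provider"
// and type "uses_subprocessor", the tuple is "ExampleCorp|Example Cloud Provider|uses_subprocessor";
// a second occurrence of the same triple within one run is ignored.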
async discoverAndProcessProcessorPage(processorName: string, processorFile: TFile) {
new Notice(`Processing (discovery): ${processorName}...`);
const searchData = await this.fetchProcessorSearchDataWithDiscovery(processorName);
if (searchData) {
await this.persistSubprocessorInfo(processorName, processorFile, searchData);
if (searchData.flaggedCandidateUrlCount > 0) {
new Notice(`${searchData.flaggedCandidateUrlCount} URL(s) looked promising but couldn't be verified. Check Analysis Log for details and consider using the 'Input from Text' feature.`);
}
} else {
new Notice(`Failed to fetch data via discovery for ${processorName}.`);
}
}
async enrichProcessorFile(processorName: string, file: TFile) {
new Notice(`Fetching compliance documents for ${processorName}...`, 5000);
const rbToken = await this.getRightBrainAccessToken();
if (!rbToken) {
new Notice("Failed to get RightBrain token. Aborting enrichment.");
return;
}
const documentTypes = [
{ type: 'DPA', taskId: this.settings.rightbrainFindDpaTaskId, title: "Data Processing Agreement" },
{ type: 'ToS', taskId: this.settings.rightbrainFindTosTaskId, title: "Terms of Service" },
{ type: 'Security', taskId: this.settings.rightbrainFindSecurityTaskId, title: "Security Documentation" }
];
const foundDocuments: { title: string, url: string }[] = [];
for (const doc of documentTypes) {
if (!doc.taskId) {
if (this.settings.verboseDebug) console.log(`Skipping ${doc.type} search for ${processorName}, no Task ID set.`);
continue;
}
const taskInputPayload = { "company_name": processorName };
const taskResult = await this.callRightBrainTask(doc.taskId, taskInputPayload, rbToken);
// Assuming the RightBrain task returns a simple { "url": "..." } object
if (taskResult?.response?.url && this.isValidUrl(taskResult.response.url)) {
foundDocuments.push({ title: doc.title, url: taskResult.response.url });
if (this.settings.verboseDebug) console.log(`Found ${doc.type} for ${processorName}: ${taskResult.response.url}`);
} else {
if (this.settings.verboseDebug) console.warn(`Could not find valid URL for ${doc.type} for ${processorName}. Result:`, taskResult);
}
await new Promise(resolve => setTimeout(resolve, 500)); // Small delay between tasks
}
if (foundDocuments.length === 0) {
new Notice(`No new compliance documents found for ${processorName}.`);
return;
}
// Format the results into a markdown list
let markdownContent = "\n"; // Start with a newline to ensure separation
foundDocuments.forEach(doc => {
markdownContent += `- **${doc.title}:** [${doc.url}](${doc.url})\n`;
});
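// Example of the generated section body (URLs illustrative):
// - **Data Processing Agreement:** [https://example.com/dpa](https://example.com/dpa)
// - **Terms of Service:** [https://example.com/tos](https://example.com/tos)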
// Use ensureHeadingAndSection to append to the file
const heading = "Compliance Documentation";
await this.app.vault.process(file, (content: string) => {
// The 'true' at the end tells the function to append under the heading if it already exists.
// This prevents creating duplicate sections if you run enrichment multiple times.
return this.ensureHeadingAndSection(content, heading, markdownContent, null, null, true);
});
new Notice(`Successfully added ${foundDocuments.length} document link(s) to ${processorName}.`);
}
async setupRightBrainTasks() {
new Notice("Starting RightBrain task setup...", 3000);
const rbToken = await this.getRightBrainAccessToken();
if (!rbToken) {
new Notice("Setup failed: Could not get RightBrain Access Token.");
return;
}
const existingTasks = await this.listAllRightBrainTasks(rbToken);
if (existingTasks === null) {
new Notice("Setup failed: Could not retrieve existing tasks from RightBrain.");
return;
}
const existingTaskNames = new Set(existingTasks.map(task => task.name));
let tasksCreated = 0;
let tasksSkipped = 0;
for (const taskDef of taskDefinitions) {
if (existingTaskNames.has(taskDef.name)) {
new Notice(`Task '${taskDef.name}' already exists. Skipping.`);
tasksSkipped++;
} else {
new Notice(`Creating task: '${taskDef.name}'...`);
const createdTask = await this.createRightBrainTask(rbToken, taskDef);
if (createdTask) {
tasksCreated++;
// Optional: Automatically save the new Task ID to settings
// This part requires careful mapping between task names and setting keys
}
}
}
new Notice(`Setup complete. Created: ${tasksCreated} task(s), Skipped: ${tasksSkipped} existing task(s).`, 10000);
}
/**
* Fetches a list of all tasks from the configured RightBrain project.
* @param rbToken The RightBrain access token.
* @returns An array of task objects or null if an error occurs.
*/
private async listAllRightBrainTasks(rbToken: string): Promise<any[] | null> {
if (!this.settings.rightbrainOrgId || !this.settings.rightbrainProjectId) {
new Notice("RightBrain Org ID or Project ID not set.");
return null;
}
const tasksUrl = `https://stag.leftbrain.me/api/v1/org/${this.settings.rightbrainOrgId}/project/${this.settings.rightbrainProjectId}/task`;
const headers = { 'Authorization': `Bearer ${rbToken}` };
try {
const response = await requestUrl({ url: tasksUrl, method: 'GET', headers: headers, throw: false });
if (response.status === 200) {
// The API response nests the list under a 'tasks' key
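// Assumed response shape (only each task's 'name' is used by setupRightBrainTasks):
// { "tasks": [ { "name": "...", ... } ] }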
return response.json.tasks || [];
} else {
console.error("Failed to list RightBrain tasks:", response.status, response.text);
return null;
}
} catch (error) {
console.error("Error fetching RightBrain tasks:", error);
return null;
}
}
/**
* Creates a single new task in RightBrain using a provided definition.
* @param rbToken The RightBrain access token.
* @param taskDefinition An object containing the full configuration for the new task.
* @returns The created task object or null if an error occurs.
*/
private async createRightBrainTask(rbToken: string, taskDefinition: any): Promise<any | null> {
const createUrl = `https://stag.leftbrain.me/api/v1/org/${this.settings.rightbrainOrgId}/project/${this.settings.rightbrainProjectId}/task`;
const headers = {
'Authorization': `Bearer ${rbToken}`,
'Content-Type': 'application/json'
};
try {
const response = await requestUrl({
url: createUrl,
method: 'POST',
headers: headers,
body: JSON.stringify(taskDefinition),
throw: false
});
if (response.status === 201 || response.status === 200) { // 201 = Created, 200 can also be success
new Notice(`Successfully created task: '${taskDefinition.name}'`);
return response.json;
} else {
new Notice(`Failed to create task '${taskDefinition.name}': ${response.status}`, 7000);
console.error(`Error creating task '${taskDefinition.name}':`, response.status, response.text);
return null;
}
} catch (error) {
console.error(`Network error creating task '${taskDefinition.name}':`, error);
return null;
}
}
async persistSubprocessorInfo(processorName: string, processorFile: TFile, searchData: SearchData, isTopLevelProcessor: boolean = true, mergeDecisions: string[] = []) {
new Notice(`Persisting info for: ${processorName}...`);
await this.ensureFolderExists(this.settings.processorsFolderPath);
await this.ensureFolderExists(this.settings.analysisLogsFolderPath);
const { collectedRelationships, processedUrlDetails } = searchData;
// Update the main processor file (e.g., "OpenAI.md")
await this.updateProcessorFile(processorFile, processorName, collectedRelationships, isTopLevelProcessor);
// Get unique target entity names (subprocessors or own_entities)
const uniqueTargetEntityOriginalNames = Array.from(new Set(collectedRelationships.map(r => r.SubprocessorName)));
const createdPagesForThisRun = new Set<string>(); // Track file paths created/updated in this run to avoid redundant ops
for (const targetEntityOriginalName of uniqueTargetEntityOriginalNames) {
const { filePathName: targetEntityFilePathName } = this.sanitizeNameForFilePathAndAlias(targetEntityOriginalName);
if (createdPagesForThisRun.has(targetEntityFilePathName)) {
// if (this.settings.verboseDebug) console.log(`Already processed page for ${targetEntityFilePathName} in this run, skipping.`);
continue;
}
// Get all relationships where this entity is the target (SubprocessorName)
const relationsWhereThisEntityIsTarget = collectedRelationships.filter(r => r.SubprocessorName === targetEntityOriginalName);
if (relationsWhereThisEntityIsTarget.length === 0) {
// if (this.settings.verboseDebug) console.log(`No relationships found for target ${targetEntityOriginalName}, skipping page creation.`);
continue; // Should not happen if it's in uniqueTargetEntityOriginalNames from collectedRelationships
}
// Determine if this entity is ever used as a subprocessor by *any* primary processor in the current batch
const isEverUsedAsSubprocessor = relationsWhereThisEntityIsTarget.some(r => r.RelationshipType === 'uses_subprocessor');
// Determine if this entity is an "own_entity" of the *current* primary processor being processed (processorName)
const isOwnEntityOfCurrentPrimaryProcessor = relationsWhereThisEntityIsTarget.some(
r => r.PrimaryProcessor === processorName && r.RelationshipType === 'is_own_entity'
);
let shouldCreatePage = false;
if (isEverUsedAsSubprocessor) {
shouldCreatePage = true; // Always create/update page if it's a subprocessor
// if (this.settings.verboseDebug) console.log(`Page for '${targetEntityOriginalName}' will be created/updated because it's used as a subprocessor.`);
} else if (isOwnEntityOfCurrentPrimaryProcessor) {
// If it's an own_entity of the current processor, create page only if setting is enabled
if (this.settings.createPagesForOwnEntities) {
shouldCreatePage = true;
// if (this.settings.verboseDebug) console.log(`Page for own_entity '${targetEntityOriginalName}' (of '${processorName}') will be created/updated due to setting.`);
} else {
// if (this.settings.verboseDebug) console.log(`Skipping page creation for own_entity '${targetEntityOriginalName}' (of '${processorName}') due to setting.`);
}
}
if (shouldCreatePage) {
// When creating/updating a subprocessor's page (e.g., "AWS.md"),
// we list all primary processors that use it as a subprocessor.
const clientRelationshipsForTargetEntityPage = collectedRelationships.filter(
r => r.SubprocessorName === targetEntityOriginalName && r.RelationshipType === 'uses_subprocessor'
);
await this.createOrUpdateSubprocessorFile(
targetEntityOriginalName, // The name of the subprocessor/own_entity itself
processorName, // The primary processor context (for logging/tracking, not for content of subprocessor's page directly)
clientRelationshipsForTargetEntityPage // Relationships where this entity is the subprocessor
);
createdPagesForThisRun.add(targetEntityFilePathName);
}
}
// Update the analysis log for the primary processor
await this.updateAnalysisLogPage(processorName, processedUrlDetails, collectedRelationships, mergeDecisions);
new Notice(`Finished persisting info for ${processorName}.`);
}
async searchViaRightBrainDuckDuckGo(processorName: string, rbToken: string): Promise<SerpApiResult[]> {
// The logic to create the task on the fly has been removed.
// We now just check if the setting is present.
if (!this.settings.rightbrainDuckDuckGoSearchTaskId) {
new Notice("DuckDuckGo Search Task ID is not configured. Please run the setup command or configure it in settings.", 10000);
return []; // Fail gracefully if the task ID is not set
}
const searchTaskId = this.settings.rightbrainDuckDuckGoSearchTaskId;
const searchQueries = this.generateSearchQueries(processorName);
const allResults: SerpApiResult[] = [];
const queriesToProcess = searchQueries.slice(0, Math.min(searchQueries.length, 2));
new Notice(`Performing up to ${queriesToProcess.length} DuckDuckGo searches for ${processorName}...`, 5000);
for (const query of queriesToProcess) {
const duckDuckGoUrl = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&ia=web&kl=us-en&kp=-2`;
const taskInputPayload = {
search_url_to_process: duckDuckGoUrl,
target_company_name: processorName
};
if (this.settings.verboseDebug) {
console.log(`Calling RightBrain Task ${searchTaskId} for DDG search. URL: ${duckDuckGoUrl}, Target: ${processorName}`);
}
const taskRunResult = await this.callRightBrainTask(searchTaskId, taskInputPayload, rbToken);
if (this.settings.verboseDebug && taskRunResult) {
console.log(`Full RightBrain Response for DDG search query "${query}":`, JSON.stringify(taskRunResult, null, 2));
}
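// Shape the checks below expect from the task run:
// { response: { search_results: [ { title: "...", url: "https://...", snippet: "..." }, ... ] } }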
if (taskRunResult?.response?.search_results && Array.isArray(taskRunResult.response.search_results)) {
const resultsList: any[] = taskRunResult.response.search_results;
for (const result of resultsList) {
if (result.url && result.title && (String(result.url).startsWith("http://") || String(result.url).startsWith("https://"))) {
allResults.push({
processorName: processorName,
searchQuery: query,
title: String(result.title),
url: String(result.url),
snippet: String(result.snippet || ''),
documentType: 'duckduckgo_rb_search_result'
});
}
}
if (this.settings.verboseDebug) {
console.log(`Successfully processed ${resultsList.length} search results for query "${query}"`);
}
} else {
new Notice(`DDG search via RB for "${query.substring(0, 20)}..." yielded no valid results.`, 3000);
if (this.settings.verboseDebug) {
console.warn(`RB Task for DDG Search for query "${query}" did not return expected '{ "search_results": [...] }' array or failed. Full taskRunResult:`, taskRunResult);
}
}
await new Promise(resolve => setTimeout(resolve, 700 + Math.random() * 500));
}
if (this.settings.verboseDebug) console.log(`searchViaRightBrainDuckDuckGo collected ${allResults.length} filtered candidates for ${processorName}`);
return allResults;
}
async fetchProcessorSearchDataWithDiscovery(processorName: string): Promise<SearchData | null> {
const collectedRelationships: ExtractedRelationship[] = [];
const seenRelationshipsInCurrentSearch = new Set<string>(); // Tracks (Primary, Sub, Type)
const processedUrlDetails: ProcessedUrlInfo[] = []; // Log of all URLs processed
let candidateUrlsInfo: SerpApiResult[] = [];
let flaggedCandidateUrlCount = 0;
const rbToken = await this.getRightBrainAccessToken();
if (!rbToken) {
new Notice("Could not get RightBrain Access Token. Aborting discovery.", 7000);
return null;
}
// Step 1: Initial Search (SerpAPI or RightBrain/DDG)
if (this.settings.serpApiKey) {
new Notice(`Using SerpAPI for primary search for: ${processorName}`, 5000);
const searchQueries = this.generateSearchQueries(processorName);
const serpApiResults = await this.searchSerpApiForDpas(processorName, searchQueries, this.settings.maxResultsPerProcessor);
candidateUrlsInfo.push(...serpApiResults);
} else if (this.settings.rightbrainOrgId && this.settings.rightbrainProjectId) { // Check if RightBrain is configured for DDG
new Notice(`SerpAPI key not configured. Using DuckDuckGo (Filtered Extractor Task) via RightBrain for: ${processorName}`, 5000);
// This call now uses the RB Task that filters and parses DDG results
candidateUrlsInfo = await this.searchViaRightBrainDuckDuckGo(processorName, rbToken);
} else {
new Notice("No search method configured (SerpAPI or RightBrain for DDG). Aborting discovery.", 7000);
// No need to return null immediately, as hardcoded URLs might still be processed if verboseDebug is on.
}
// Hardcoded URLs for testing (if enabled)
const hardcodedTestUrls: Record<string, SerpApiResult[]> = {
// "openai": [{ title: "Test OpenAI SubP List", url: "https://example.com/openai-subp", snippet: "", processorName: "openai", documentType: "hardcoded_test" }],
}; // Intentionally left empty; populate only temporarily for local testing
if (this.settings.verboseDebug && hardcodedTestUrls[processorName.toLowerCase()]) {
console.log(`Adding hardcoded test URLs for ${processorName}`);
candidateUrlsInfo.push(...hardcodedTestUrls[processorName.toLowerCase()]);
}
// Step 2: (Optional) Extract more URLs from already identified DPA/Subprocessor list pages
const additionalUrlsFromCandidatePages: SerpApiResult[] = [];
const pagesToScanForMoreLinks = candidateUrlsInfo.filter(
item => item.documentType === 'dpa_or_subprocessor_list' || SUBPROCESSOR_URL_KEYWORDS.some(kw => item.url.toLowerCase().includes(kw))
);
for (const pageItem of pagesToScanForMoreLinks) {
const extracted = await this.extractUrlsFromDpaPage(pageItem.url, processorName, pageItem.title);
additionalUrlsFromCandidatePages.push(...extracted);
}
candidateUrlsInfo.push(...additionalUrlsFromCandidatePages);
// Create a unique list of URLs to process, prioritizing earlier found ones.
const uniqueCandidateUrls = new Map<string, SerpApiResult>();
candidateUrlsInfo.forEach(item => {
if (item.url && (item.url.startsWith("http://") || item.url.startsWith("https://")) && !uniqueCandidateUrls.has(item.url.replace(/\/$/, ''))) { // Normalize URL by removing trailing slash
uniqueCandidateUrls.set(item.url.replace(/\/$/, ''), item);
}
});
const uniqueUrlsToProcess = Array.from(uniqueCandidateUrls.values());
if (this.settings.verboseDebug) console.log(`Total unique URLs to verify for ${processorName}: ${uniqueUrlsToProcess.length}`);
if (uniqueUrlsToProcess.length === 0 && candidateUrlsInfo.length > 0) {
if (this.settings.verboseDebug) console.warn(`All candidate URLs were invalid or duplicates for ${processorName}. Original count: ${candidateUrlsInfo.length}`);
} else if (uniqueUrlsToProcess.length === 0) {
new Notice(`No candidate URLs found to process for ${processorName}.`);
// No URLs to process, so return current state (likely empty relationships)
// return { collectedRelationships, processedUrlDetails, flaggedCandidateUrlCount };
}
// Step 3: Verify each unique URL and extract entities if it's a valid, current list
for (const urlInfo of uniqueUrlsToProcess) {
// Avoid re-processing if this exact URL (normalized) was somehow added to processedUrlDetails already
// This is a safeguard, uniqueUrlsToProcess should ideally handle this.
if (processedUrlDetails.some(p => p.url.replace(/\/$/, '') === urlInfo.url.replace(/\/$/, ''))) {
if(this.settings.verboseDebug) console.log(`URL ${urlInfo.url} already processed in processedUrlDetails, skipping re-verification.`);
continue;
}
let currentUrlExtractedCount = 0;
// Initialize processedUrlInfo for logging, merging urlInfo with defaults
let currentProcessedUrlInfo: ProcessedUrlInfo = { ...urlInfo, documentType: urlInfo.documentType || 'duckduckgo_rb_search_result' }; // Default type if not set
const verificationResult = await this.verifySubprocessorListUrl(urlInfo.url, processorName, rbToken);
currentProcessedUrlInfo = { // Update with verification attempt details
...currentProcessedUrlInfo,
verificationMethod: 'rightbrain', // Assuming RB is always used for verification now
isList: verificationResult?.isList || false,
isCurrent: verificationResult?.isCurrent || false, // isCurrent implies isList
verificationReasoning: verificationResult?.reasoning || 'N/A'
};
if (verificationResult?.isList && verificationResult.isCurrent && verificationResult.isCorrectProcessor) {
currentProcessedUrlInfo.documentType = 'verified_current_subprocessor_list';
if (verificationResult.pageContent) {
const extractionResult = await this.extractEntitiesFromPageContent(verificationResult.pageContent, rbToken);
if (extractionResult) {
const { thirdPartySubprocessors, ownEntities } = extractionResult;
thirdPartySubprocessors.forEach(e => {
currentUrlExtractedCount += this.addRelationship(collectedRelationships, seenRelationshipsInCurrentSearch, processorName, e, "uses_subprocessor", urlInfo.url, verificationResult.reasoning);
});
ownEntities.forEach(e => {
currentUrlExtractedCount += this.addRelationship(collectedRelationships, seenRelationshipsInCurrentSearch, processorName, e, "is_own_entity", urlInfo.url, verificationResult.reasoning);
});
} else { currentProcessedUrlInfo.documentType = 'verified_current_subprocessor_list (rb_extraction_failed)'; }
} else { currentProcessedUrlInfo.documentType = 'verified_current_subprocessor_list (no_content_for_extraction)';}
currentProcessedUrlInfo.extractedSubprocessorsCount = currentUrlExtractedCount;
processedUrlDetails.push(currentProcessedUrlInfo);
} else { // This block now catches lists that are not current OR not for the correct processor
const urlLower = urlInfo.url.toLowerCase();
const containsKeyword = SUBPROCESSOR_URL_KEYWORDS.some(keyword => urlLower.includes(keyword));
if (!verificationResult?.isList && containsKeyword) {
currentProcessedUrlInfo.documentType = 'keyword_match_not_verified_list';
flaggedCandidateUrlCount++;
} else if (verificationResult?.isList && !verificationResult.isCorrectProcessor) {
currentProcessedUrlInfo.documentType = 'verified_list_for_wrong_processor';
flaggedCandidateUrlCount++; // Also flag these as they are interesting but were correctly ignored
} else if (verificationResult?.isList) {
currentProcessedUrlInfo.documentType = 'verified_subprocessor_list (not_current)';
} else {
currentProcessedUrlInfo.documentType = 'not_a_subprocessor_list';
}
currentProcessedUrlInfo.extractedSubprocessorsCount = 0;
processedUrlDetails.push(currentProcessedUrlInfo);
}
} // End of loop through unique URLs
return { collectedRelationships, processedUrlDetails, flaggedCandidateUrlCount };
}
async fetchDataFromDirectUrl(processorName: string, listUrl: string): Promise<SearchData | null> {
if (this.settings.verboseDebug) console.log(`Fetching data from direct URL for ${processorName}: ${listUrl}`);
if (!this.isValidUrl(listUrl, processorName)) { // Basic URL validation
new Notice(`The provided URL for ${processorName} is not valid: ${listUrl}`);
return null;
}
const collectedRelationships: ExtractedRelationship[] = [];
const seenRelationshipsInCurrentSearch = new Set<string>();
const processedUrlDetails: ProcessedUrlInfo[] = [];
let flaggedCandidateUrlCount = 0;
const directUrlInfoBase: Partial<SerpApiResult> = { // Base info for this manually provided URL
title: `Manually Provided List for ${processorName}`, url: listUrl,
snippet: 'Manually provided URL', processorName: processorName, documentType: 'direct_input_list',
};
let currentProcessedUrlInfo: ProcessedUrlInfo = { ...directUrlInfoBase, url: listUrl, documentType: 'direct_input_list' };
const rbToken = await this.getRightBrainAccessToken();
if (!rbToken) { // RB token is essential for verification and extraction
new Notice("Could not obtain RightBrain token for direct URL processing.");
currentProcessedUrlInfo.verificationMethod = 'N/A (No RB Token)';
processedUrlDetails.push(currentProcessedUrlInfo); // Log the attempt
return { collectedRelationships, processedUrlDetails, flaggedCandidateUrlCount }; // Return with no data but with log
}
let currentUrlExtractedCount = 0;
const verificationResult = await this.verifySubprocessorListUrl(listUrl, processorName, rbToken);
// Update currentProcessedUrlInfo with verification details
currentProcessedUrlInfo.verificationMethod = 'rightbrain';
currentProcessedUrlInfo.isList = verificationResult?.isList || false;
currentProcessedUrlInfo.isCurrent = verificationResult?.isCurrent || false; // isCurrent implies isList
currentProcessedUrlInfo.verificationReasoning = verificationResult?.reasoning || 'N/A';
if (verificationResult && verificationResult.isList && verificationResult.isCurrent) {
new Notice(`Verified manual URL: ${listUrl} as current list.`);
currentProcessedUrlInfo.documentType = 'verified_current_subprocessor_list (manual_url_input)';
if (verificationResult.pageContent) {
const extractionResult = await this.extractEntitiesFromPageContent(verificationResult.pageContent, rbToken);
if (extractionResult) {
const { thirdPartySubprocessors, ownEntities } = extractionResult;
thirdPartySubprocessors.forEach(e => {
currentUrlExtractedCount += this.addRelationship(collectedRelationships, seenRelationshipsInCurrentSearch, processorName, e, "uses_subprocessor", listUrl, verificationResult.reasoning);
});
ownEntities.forEach(e => {
currentUrlExtractedCount += this.addRelationship(collectedRelationships, seenRelationshipsInCurrentSearch, processorName, e, "is_own_entity", listUrl, verificationResult.reasoning);
});
} else { currentProcessedUrlInfo.documentType = 'verified_current_subprocessor_list (manual_url_input_rb_extraction_failed)';}
} else {currentProcessedUrlInfo.documentType = 'verified_current_subprocessor_list (manual_url_input_no_content)';}
} else { // Not verified as current and valid, or verification failed
const urlLower = listUrl.toLowerCase();
const containsKeyword = SUBPROCESSOR_URL_KEYWORDS.some(keyword => urlLower.includes(keyword));
if (!verificationResult?.isList && containsKeyword) { // Looks like one (keyword), but RB says no
currentProcessedUrlInfo.documentType = 'keyword_match_not_verified_list (manual_url_input)';
flaggedCandidateUrlCount++;
new Notice(`Manual URL ${listUrl} looks like a subprocessor list but couldn't be verified. Reason: ${this.scrubHyperlinks(verificationResult?.reasoning) || 'Details unavailable.'}`);
if (this.settings.verboseDebug) console.log(`Flagged Manual URL (keyword match, not verified): ${listUrl}`);
} else if (verificationResult?.isList) { // RB says it's a list, but not current
currentProcessedUrlInfo.documentType = 'verified_subprocessor_list (manual_url_input_not_current)';
new Notice(`Manual URL ${listUrl} verified as a list, but not current. Reason: ${this.scrubHyperlinks(verificationResult?.reasoning) || 'Details unavailable.'}`);
} else { // RB says not a list, or verification failed more broadly
currentProcessedUrlInfo.documentType = 'not_a_subprocessor_list (manual_url_input)';
new Notice(`Manual URL ${listUrl} could not be verified as a list. Reason: ${this.scrubHyperlinks(verificationResult?.reasoning) || 'Details unavailable.'}`);
}
}
currentProcessedUrlInfo.extractedSubprocessorsCount = currentUrlExtractedCount;
processedUrlDetails.push(currentProcessedUrlInfo); // Log the processing attempt for this URL
return { collectedRelationships, processedUrlDetails, flaggedCandidateUrlCount };
}
async fetchDataFromPastedText(processorName: string, pastedText: string): Promise<SearchData | null> {
if (this.settings.verboseDebug) console.log(`Fetching data from pasted text for ${processorName}`);
if (!this.settings.rightbrainExtractEntitiesTaskId) { // Check for the specific task ID
new Notice("RightBrain Task ID for entity extraction is not configured. Please set it in plugin settings.");
return null;
}
const collectedRelationships: ExtractedRelationship[] = [];
const seenRelationshipsInCurrentSearch = new Set<string>();
const processedUrlDetails: ProcessedUrlInfo[] = []; // To log this "text processing" event
const rbToken = await this.getRightBrainAccessToken();
if (!rbToken) { // RB token is essential
new Notice("Could not obtain RightBrain token for pasted text processing.");
// Log this attempt as a failure due to no token
processedUrlDetails.push({
url: `text_input_for_${this.sanitizeNameForFilePathAndAlias(processorName).filePathName}`, // Placeholder URL for logging
title: `Pasted Text for ${processorName}`,
documentType: 'manual_text_submission_failed (no_rb_token)',
// No verification details applicable here as the process couldn't start
});
return { collectedRelationships, processedUrlDetails, flaggedCandidateUrlCount: 0 };
}
// Prepare input for the RB task based on configured field name
const taskInput = { [this.settings.rightbrainExtractInputField]: pastedText };
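// With the default settings this payload is { page_text: "<pasted text>" }, and the task response is
// read back via the configured output fields (defaults: 'third_party_subprocessors' and 'own_entities').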
const extractionResult = await this.callRightBrainTask(this.settings.rightbrainExtractEntitiesTaskId, taskInput, rbToken);
let currentUrlExtractedCount = 0;
const sourcePlaceholder = `manual_text_input:${processorName}`; // For the SourceURL field
if (extractionResult && typeof extractionResult.response === 'object' && extractionResult.response !== null) {
const rbResponse = extractionResult.response;
// Access extracted entities using configured field names
const thirdParty = rbResponse[this.settings.rightbrainExtractOutputThirdPartyField] || [];
const own = rbResponse[this.settings.rightbrainExtractOutputOwnEntitiesField] || [];
thirdParty.forEach((e: any) => { // Assuming 'e' is an object with 'name', 'processing_function', 'location'
currentUrlExtractedCount += this.addRelationship(collectedRelationships, seenRelationshipsInCurrentSearch, processorName, e, "uses_subprocessor", sourcePlaceholder, "Processed from manually pasted text.");
});
own.forEach((e: any) => {
currentUrlExtractedCount += this.addRelationship(collectedRelationships, seenRelationshipsInCurrentSearch, processorName, e, "is_own_entity", sourcePlaceholder, "Processed from manually pasted text.");
});
// Log successful processing
processedUrlDetails.push({
url: sourcePlaceholder,
title: `Pasted Text for ${processorName}`,
documentType: 'manual_text_submission_processed',
verificationMethod: 'rightbrain_text_task',
extractedSubprocessorsCount: currentUrlExtractedCount,
verificationReasoning: `Extracted ${currentUrlExtractedCount} entities from pasted text.`
});
} else { // RB task failed or returned unexpected format
new Notice(`Failed to extract entities from pasted text for ${processorName}. Check console.`);
console.error(`ProcessorProcessor: RB Extract From Text task did not return expected 'response' object or failed. Full task result:`, JSON.stringify(extractionResult).substring(0,500));
// Log failed processing
processedUrlDetails.push({
url: sourcePlaceholder,
title: `Pasted Text for ${processorName}`,
documentType: 'manual_text_submission_failed (rb_task_error)',
verificationMethod: 'rightbrain_text_task',
verificationReasoning: 'RightBrain task for text processing failed or returned an unexpected response.'
});
}
return { collectedRelationships, processedUrlDetails, flaggedCandidateUrlCount: 0 }; // flaggedCandidateUrlCount is 0 for text input
}
private async ensureFolderExists(folderPath: string): Promise<void> {
// Normalize path: remove leading slash if present, as vault paths are relative to vault root
try {
const normalizedPath = folderPath.startsWith('/') ? folderPath.substring(1) : folderPath;
if (normalizedPath === '') return; // Do nothing if path is empty (e.g. root, though not typical for this use)
const abstractFolderPath = this.app.vault.getAbstractFileByPath(normalizedPath);
if (!abstractFolderPath) {
await this.app.vault.createFolder(normalizedPath);
if (this.settings.verboseDebug) console.log(`Folder created: ${normalizedPath}`);
}
// else { if (this.settings.verboseDebug) console.log(`Folder already exists: ${normalizedPath}`); }
} catch (e) {
// Don't throw, but log and notify. The operation might still proceed if the folder exists but an error occurred checking.
console.error(`Error ensuring folder ${folderPath} exists:`, e);
new Notice(`Error creating folder: ${folderPath}`);
}
}
private async ensureProcessorFile(originalProcessorName: string, addFrontmatter: boolean = false, isTopLevelProcessor: boolean = true): Promise<TFile | null> {
await this.ensureFolderExists(this.settings.processorsFolderPath);
const { filePathName, originalNameAsAlias } = this.sanitizeNameForFilePathAndAlias(originalProcessorName);
const folder = this.settings.processorsFolderPath.startsWith('/') ? this.settings.processorsFolderPath.substring(1) : this.settings.processorsFolderPath;
const filePath = `${folder}/${filePathName}.md`;
let file = this.app.vault.getAbstractFileByPath(filePath) as TFile;
if (!file) {
try {
let initialContent = "";
if (addFrontmatter) {
const tag = isTopLevelProcessor ? 'processor' : 'subprocessor';
const aliasForFrontmatter = originalNameAsAlias.replace(/[:\[\],"]/g, '');
initialContent = `---\ntags: [${tag}]\naliases: ["${aliasForFrontmatter}"]\n---\n\n# ${originalNameAsAlias}\n\n`;
} else {
initialContent = `# ${originalNameAsAlias}\n\n`;
}
file = await this.app.vault.create(filePath, initialContent);
} catch (e: any) {
if (e.message?.toLowerCase().includes("file already exists")) {
file = this.app.vault.getAbstractFileByPath(filePath) as TFile;
if (!file) { console.error(`Failed to get file ${filePath} after 'already exists' error.`); return null; }
} else { console.error(`Error creating processor file ${filePath}:`, e); return null; }
}
}
if (file && addFrontmatter) {
const tag = isTopLevelProcessor ? 'processor' : 'subprocessor';
const aliasForFrontmatter = originalNameAsAlias.replace(/[:\[\],"]/g, '');
await this.app.vault.process(file, (content) => {
let newContent = this.updateFrontmatter(content, { tags: [tag], aliases: [aliasForFrontmatter] }, originalNameAsAlias);
if (!newContent.trim().includes(`# ${originalNameAsAlias}`)) {
// Locate the end of the YAML frontmatter block (if any) so the H1 lands below it
const fmBlockMatch = newContent.match(/^---\s*\n[\s\S]*?\n---\s*\n/);
const bodyStartIndex = fmBlockMatch ? fmBlockMatch[0].length : 0;
const body = newContent.substring(bodyStartIndex);
const frontmatterPart = newContent.substring(0, bodyStartIndex);
newContent = frontmatterPart + (frontmatterPart.endsWith("\n") ? "" : "\n") + `# ${originalNameAsAlias}\n\n` + body.trimStart();
}
return newContent;
});
}
return file;
}
private async updateProcessorFile(file: TFile, originalProcessorName: string, relationships: ExtractedRelationship[], isTopLevelProcessor: boolean) {
const subprocessorsHeading = "Subprocessors";
let tableMd = `| Subprocessor Entity Name | Processing Function | Location |\n`;
tableMd += `|---|---|---|\n`;
// Filter for 'uses_subprocessor' relationships where the current processor is the PrimaryProcessor
const relevantRelationships = relationships.filter(r => r.RelationshipType === 'uses_subprocessor' && r.PrimaryProcessor === originalProcessorName);
relevantRelationships.forEach(rel => {
// Sanitize the subprocessor's name for file path and get original for alias
const { filePathName: subFilePathName, originalNameAsAlias: subOriginalName } = this.sanitizeNameForFilePathAndAlias(rel.SubprocessorName);
// Prepare display alias for Markdown link (remove chars that break links/display)
const markdownAlias = subOriginalName.replace(/\n/g, ' ').replace(/[\[\]()|]/g, ''); // Basic sanitization for link text
const processorsFolder = this.settings.processorsFolderPath; // No leading/trailing slashes needed by encodeURI if path is clean
const markdownLinkTarget = encodeURI(`${processorsFolder}/${subFilePathName}.md`); // Use sanitized name for link target
const subprocessorPageLink = `[${markdownAlias}](${markdownLinkTarget})`; // Use standard Markdown link format
// Scrub and prepare display for processing function and location
const processingFunctionDisplay = (rel.ProcessingFunction || "N/A").replace(/\n/g, "<br>").replace(/\|/g, "\\|");
const locationDisplay = (rel.Location || "N/A").replace(/\n/g, "<br>").replace(/\|/g, "\\|");
tableMd += `| ${subprocessorPageLink} | ${processingFunctionDisplay} | ${locationDisplay} |\n`;
});
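// Example row (names and values illustrative):
// | [Example Cloud Provider](Processors/Example%20Cloud%20Provider.md) | Hosting and storage | EU |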
const analysisLogsHeading = "Analysis Logs";
// Sanitize processor name for log file name part
const { filePathName: logFilePathNamePart } = this.sanitizeNameForFilePathAndAlias(originalProcessorName);
const analysisLogsFolder = this.settings.analysisLogsFolderPath; // Normalized
const logFileName = `${logFilePathNamePart} Subprocessor Logs.md`;
const logFileLink = `[[${analysisLogsFolder}/${logFileName}|${originalProcessorName} Subprocessor Logs]]`; // Obsidian wikilink to the analysis log note
const analysisLogSection = `\n- ${logFileLink}\n`;
await this.app.vault.process(file, (content: string) => {
const tag = isTopLevelProcessor ? 'processor' : 'subprocessor';
let newContent = this.updateFrontmatter(content, { tags: [tag], aliases: [originalProcessorName.replace(/[:\[\],"]/g, '')] }, originalProcessorName);
// Ensure H1 heading for originalProcessorName
if (!newContent.trim().includes(`# ${originalProcessorName}`)) {
// Locate the end of the YAML frontmatter block (if any) so the H1 lands below it
const fmBlockMatch = newContent.match(/^---\s*\n[\s\S]*?\n---\s*\n/);
const bodyStartIndex = fmBlockMatch ? fmBlockMatch[0].length : 0;
const body = newContent.substring(bodyStartIndex);
const frontmatterPart = newContent.substring(0, bodyStartIndex);
newContent = frontmatterPart + (frontmatterPart.endsWith("\n") ? "" : "\n") + `# ${originalProcessorName}\n\n` + body.trimStart();
}
// Ensure/Update Subprocessors section
newContent = this.ensureHeadingAndSection(newContent, subprocessorsHeading, tableMd, null, null); // Replace entire section
// Ensure/Update Analysis Logs section
newContent = this.ensureHeadingAndSection(newContent, analysisLogsHeading, analysisLogSection, null, null, true); // Append if heading exists, else create
return newContent;
});
}
private async createOrUpdateSubprocessorFile(
originalSubprocessorName: string, // The name of the subprocessor itself (e.g., "AWS")
originalPrimaryProcessorNameForContext: string, // The primary processor currently being processed (e.g., "OpenAI") - for context, not usually main content of AWS.md
newClientRelationships: ExtractedRelationship[] // Relationships where originalSubprocessorName is the target (SubprocessorName) and type is 'uses_subprocessor'
) {
await this.ensureFolderExists(this.settings.processorsFolderPath);
const { filePathName: subFilePathName, originalNameAsAlias: subOriginalNameAsAlias } = this.sanitizeNameForFilePathAndAlias(originalSubprocessorName);
const folder = this.settings.processorsFolderPath.startsWith('/') ? this.settings.processorsFolderPath.substring(1) : this.settings.processorsFolderPath;
const subFilePath = `${folder}/${subFilePathName}.md`;
let file = this.app.vault.getAbstractFileByPath(subFilePath) as TFile;
if (!file) {
const aliasForFrontmatter = subOriginalNameAsAlias.replace(/[:\[\],"]/g, '');
const initialContent = `---\ntags: [subprocessor]\naliases: ["${aliasForFrontmatter}"]\n---\n\n# ${subOriginalNameAsAlias}\n\n## Used By\n\n`;
try {
file = await this.app.vault.create(subFilePath, initialContent);
} catch (e: any) {
if (e.message?.toLowerCase().includes("file already exists")) {
file = this.app.vault.getAbstractFileByPath(subFilePath) as TFile;
if (!file) { console.error(`Failed to get subprocessor file ${subFilePath} after 'already exists' error.`); return; }
} else { console.error(`Error creating subprocessor file ${subFilePath}:`, e); return; }
}
}
if (!file) return; // Should not happen if creation/retrieval was successful
await this.app.vault.process(file, (content: string) => {
let newContent = this.updateFrontmatter(content, { tags: ["subprocessor"], aliases: [subOriginalNameAsAlias.replace(/[:\[\],"]/g, '')] }, subOriginalNameAsAlias);
if (!newContent.trim().includes(`# ${subOriginalNameAsAlias}`)) {
// Locate the end of the YAML frontmatter block (if any) so the H1 lands below it
const fmBlockMatch = newContent.match(/^---\s*\n[\s\S]*?\n---\s*\n/);
const bodyStartIndex = fmBlockMatch ? fmBlockMatch[0].length : 0;
const body = newContent.substring(bodyStartIndex);
const frontmatterPart = newContent.substring(0, bodyStartIndex);
newContent = frontmatterPart + (frontmatterPart.endsWith("\n") ? "" : "\n") + `# ${subOriginalNameAsAlias}\n\n` + body.trimStart();
}
const usedByHeading = "Used By";
// Step 1: Extract existing rows and put them in a Set to handle uniqueness
const existingRows = this.extractClientTableRows(content);
const allRows = new Set<string>(existingRows);
// Step 2: Process new relationships and add them to the Set
newClientRelationships.forEach(rel => {
const { filePathName: primaryFilePathName, originalNameAsAlias: primaryOriginalName } = this.sanitizeNameForFilePathAndAlias(rel.PrimaryProcessor);
const markdownPrimaryAlias = primaryOriginalName.replace(/\n/g, ' ').replace(/[\[\]()|]/g, '');
// Use a standard Markdown link rather than a wikilink so the pipe-delimited table row is not broken
const primaryLinkTarget = encodeURI(`${this.settings.processorsFolderPath}/${primaryFilePathName}.md`);
const primaryProcessorLink = `[${markdownPrimaryAlias}](${primaryLinkTarget})`;
const processingFunctionDisplay = (rel.ProcessingFunction || "N/A").replace(/\n/g, "<br>").replace(/\|/g, "\\|");
const locationDisplay = (rel.Location || "N/A").replace(/\n/g, "<br>").replace(/\|/g, "\\|");
const sourceUrlLink = rel.SourceURL.startsWith("http") ? `[Source](${rel.SourceURL})` : rel.SourceURL;
// Create the inner content of the row to be added to the Set
const rowContent = ` ${primaryProcessorLink} | ${processingFunctionDisplay} | ${locationDisplay} | ${sourceUrlLink} `;
allRows.add(rowContent);
});
// Step 3: Build the final, complete table from the Set of all rows
let clientTableMd = `| Primary Processor | Processing Function | Location | Source URL |\n`;
clientTableMd += `|---|---|---|---|\n`;
allRows.forEach(row => {
// Re-add the outer pipes for each row
clientTableMd += `|${row}|\n`;
});
// Step 4: Replace the old section with the new, merged table
newContent = this.ensureHeadingAndSection(newContent, usedByHeading, clientTableMd, null, null);
return newContent;
});
}
private updateFrontmatter(content: string, updates: { tags?: string[], aliases?: string[] }, pageNameForAlias: string): string {
let fm: any = {};
const fmRegex = /^---\s*\n([\s\S]*?)\n---\s*\n/;
const match = content.match(fmRegex);
let body = content;
if (match && match[1]) {
try {
// Basic YAML parsing - for more complex YAML, a library would be needed
const yamlLines = match[1].split('\n');
yamlLines.forEach(line => {
const parts = line.split(':');
if (parts.length >= 2) {
const key = parts[0].trim();
const value = parts.slice(1).join(':').trim();
if (key === 'tags' || key === 'aliases') {
// Try to parse as array if it looks like one, otherwise treat as string
if (value.startsWith('[') && value.endsWith(']')) {
fm[key] = value.substring(1, value.length - 1).split(',').map(s => s.trim().replace(/^["']|["']$/g, ''));
} else { // Single item not in list format
fm[key] = [value.replace(/^["']|["']$/g, '')];
}
} else {
fm[key] = value.replace(/^["']|["']$/g, ''); // Simple string value
}
}
});
} catch (e) {
console.warn("ProcessorProcessor: Could not parse existing frontmatter, will overwrite relevant keys.", e);
fm = {}; // Reset if parsing fails
}
body = content.substring(match[0].length);
}
// Update tags
if (updates.tags) {
const currentTags = new Set(Array.isArray(fm.tags) ? fm.tags.map((t: string) => String(t).toLowerCase()) : []);
updates.tags.forEach(tag => currentTags.add(String(tag).toLowerCase()));
fm.tags = Array.from(currentTags);
}
// Update aliases, ensuring pageNameForAlias (sanitized) is present
if (updates.aliases) {
const currentAliases = new Set(Array.isArray(fm.aliases) ? fm.aliases.map((a: string) => String(a)) : []);
updates.aliases.forEach(alias => {
const sanitizedAlias = String(alias).replace(/[:\[\],"]/g, ''); // Sanitize for YAML
if (sanitizedAlias) currentAliases.add(sanitizedAlias);
});
// Ensure the main pageNameForAlias (sanitized) is also present as an alias
const sanitizedPageNameAlias = String(pageNameForAlias).replace(/[:\[\],"]/g, '');
if (sanitizedPageNameAlias) currentAliases.add(sanitizedPageNameAlias);
fm.aliases = Array.from(currentAliases);
}
// Reconstruct frontmatter string
let fmString = "---\n";
for (const key in fm) {
if (fm.hasOwnProperty(key)) {
if (Array.isArray(fm[key])) {
if (fm[key].length > 0) {
fmString += `${key}: [${fm[key].map((item: string) => `"${item}"`).join(', ')}]\n`;
}
} else {
fmString += `${key}: "${fm[key]}"\n`;
}
}
}
fmString += "---\n";
// If there was no original frontmatter and we didn't actually add any valid fm, don't prepend empty fm block
if (fmString === "---\n---\n" && !match) {
return body;
}
return fmString + body;
}
private async updateAnalysisLogPage(processorName: string, processedUrls: ProcessedUrlInfo[], relationships: ExtractedRelationship[], mergeDecisions: string[]) {
await this.ensureFolderExists(this.settings.analysisLogsFolderPath);
const { filePathName: sanitizedProcessorNameForLogFile } = this.sanitizeNameForFilePathAndAlias(processorName);
const logsFolder = this.settings.analysisLogsFolderPath; // Normalized
const logFileName = `${sanitizedProcessorNameForLogFile} Subprocessor Logs.md`; // Use sanitized name
const logFilePath = `${logsFolder}/${logFileName}`;
const logEntryContent = this.formatResultsForObsidianLog(processorName, relationships, processedUrls, mergeDecisions);
// Use ensure_exists_and_append mode. The file title is added on creation by writeResultsToObsidianNote; each entry heading comes from formatResultsForObsidianLog.
await this.writeResultsToObsidianNote(logFilePath, logEntryContent, 'ensure_exists_and_append', processorName);
}
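// Replaces the body of a named section. If start/end markers are supplied and found, the text
// between them is swapped out; otherwise the content under the matching heading is replaced
// (or appended, if requested), and if no heading exists a new "## <headingText>" section is
// added at the end of the note.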
private ensureHeadingAndSection(
content: string,
headingText: string,
sectionNewContent: string,
startMarker: string | null = null, // e.g., <!-- START: SUBPROCESSORS -->
endMarker: string | null = null, // e.g., <!-- END: SUBPROCESSORS -->
appendUnderHeadingIfNoMarkers = false // If true and markers not found, appends under existing heading if found
): string {
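// Escape any regex metacharacters in the heading text, then match it at any heading level, case-insensitively.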
const headingRegex = new RegExp(`^(#+)\\s*${headingText.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}(\\s*\\n|$)`, "im");
const headingMatch = content.match(headingRegex);
const sectionWithHeading = `\n## ${headingText}\n${sectionNewContent.trim()}\n`;
if (startMarker && endMarker) {
const startIdx = content.indexOf(startMarker);
const endIdx = content.indexOf(endMarker);
if (startIdx !== -1 && endIdx !== -1 && startIdx < endIdx) {
// Markers found, replace content between them (exclusive of markers themselves)
return content.substring(0, startIdx + startMarker.length) +
`\n${sectionNewContent.trim()}\n` + // Ensure new content is on new lines
content.substring(endIdx);
}
}
// Markers not used or not found, try to find heading
if (headingMatch) {
// Heading found
const headingLevel = headingMatch[1].length; // e.g., "##" -> length 2
const nextHeadingRegex = new RegExp(`^#{1,${headingLevel}}\\s+.*(\\s*\\n|$)`, "im");
let startIndexAfterHeading = headingMatch.index! + headingMatch[0].length;
let contentAfterHeading = content.substring(startIndexAfterHeading);
let endIndex = content.length; // Default to end of content
// Find where the current section ends (start of next heading of same or higher level, or end of doc)
const nextMatch = contentAfterHeading.match(nextHeadingRegex);
if (nextMatch) {
endIndex = startIndexAfterHeading + nextMatch.index!;
}
if (appendUnderHeadingIfNoMarkers) {
// Append new content under the existing heading, before the next one.
// This is tricky if the section already has content. This simple append adds to the end of the section.
// For full replacement, the logic outside this `if` handles it.
return content.substring(0, endIndex) + // Content up to where next section would start (or end of doc)
`\n${sectionNewContent.trim()}\n` + // Append new stuff
content.substring(endIndex); // Rest of the document
} else {
// Replace content from after the headingMatch to where the next heading/end of doc starts
return content.substring(0, startIndexAfterHeading) +
`${sectionNewContent.trim()}\n` +
content.substring(endIndex);
}
} else {
// Heading not found, append the new heading and section to the end
return content.trimEnd() + "\n\n" + sectionWithHeading.trimStart();
}
}
private formatResultsForObsidianLog(processorName: string, relationships: ExtractedRelationship[], processedUrls: ProcessedUrlInfo[], mergeDecisions: string[] = []): string {
let logContent = `\n---\n### Log Entry: ${new Date().toISOString()} for ${processorName}\n\n`;
if (mergeDecisions.length > 0) {
logContent += `#### Proactive Deduplication Decisions (${mergeDecisions.length}):\n`;
mergeDecisions.forEach(decision => {
logContent += `- ${decision}\n`;
});
logContent += "\n";
}
logContent += `#### Processed URLs (${processedUrls.length}):\n`;
if (processedUrls.length === 0) {
logContent += "- No URLs were processed.\n";
} else {
logContent += "| URL | Title | Type | Verified List? | Current? | Extracted # | Reasoning |\n";
logContent += "|---|---|---|---|---|---|---|\n";
processedUrls.forEach(url => {
const titleDisplay = this.scrubHyperlinks(url.title || "N/A").substring(0, 70);
const urlLink = url.url.startsWith("http") ? `[Link](${url.url})` : url.url;
const reasoningDisplay = this.scrubHyperlinks(url.verificationReasoning || "N/A").substring(0, 100);
logContent += `| ${urlLink} | ${titleDisplay}... | ${url.documentType || 'N/A'} | ${url.isList ? 'Yes' : 'No'} | ${url.isCurrent ? 'Yes' : 'No'} | ${url.extractedSubprocessorsCount || 0} | ${reasoningDisplay}... |\n`;
});
}
logContent += "\n";
logContent += `#### Extracted Relationships (${relationships.length}):\n`;
if (relationships.length === 0) {
logContent += "- No new relationships were extracted in this run.\n";
} else {
logContent += "| Primary Processor | Target Entity | Type | Function | Location | Source URL |\n";
logContent += "|---|---|---|---|---|---|\n";
relationships.forEach(rel => {
const targetEntityDisplay = this.scrubHyperlinks(rel.SubprocessorName).substring(0, 50);
const primaryProcDisplay = this.scrubHyperlinks(rel.PrimaryProcessor).substring(0, 50);
const funcDisplay = this.scrubHyperlinks(rel.ProcessingFunction).substring(0, 70);
const locDisplay = this.scrubHyperlinks(rel.Location).substring(0, 50);
const sourceUrlLink = rel.SourceURL.startsWith("http") ? `[Source](${rel.SourceURL})` : rel.SourceURL;
logContent += `| ${primaryProcDisplay} | ${targetEntityDisplay} | ${rel.RelationshipType} | ${funcDisplay}... | ${locDisplay}... | ${sourceUrlLink} |\n`;
});
}
logContent += "\n";
return logContent;
}
private async writeResultsToObsidianNote(
filePath: string, // Full path from vault root, e.g., "Analysis Logs/OpenAI Logs.md"
contentToAppendOrInitial: string,
mode: 'overwrite' | 'append' | 'ensure_exists_and_append' = 'ensure_exists_and_append',
processorNameForLogTitle?: string // Used if creating the file
) {
let file = this.app.vault.getAbstractFileByPath(filePath) as TFile;
if (!file && (mode === 'ensure_exists_and_append' || mode === 'overwrite')) {
// File doesn't exist, create it
let createdWithInitialContent = false; // Set when the entry is written as part of creating the file
let initialContent = "";
if (processorNameForLogTitle) {
initialContent += `# Analysis Log: ${processorNameForLogTitle}\n\n`;
}
initialContent += contentToAppendOrInitial; // Add the current log entry
try {
file = await this.app.vault.create(filePath, initialContent);
createdWithInitialContent = true;
if (this.settings.verboseDebug) console.log(`Log file created: ${filePath}`);
} catch (e: any) {
if (e.message?.toLowerCase().includes("file already exists")) {
file = this.app.vault.getAbstractFileByPath(filePath) as TFile;
if (!file) { console.error(`Failed to get log file ${filePath} after 'already exists' error.`); return; }
// Now that file exists, proceed to append/process if mode is ensure_exists_and_append
} else {
console.error(`Error creating log file ${filePath}:`, e);
new Notice(`Error creating log file: ${filePath}`);
return; // Stop if creation fails
}
}
// Return early only if the entry was already written during creation; if the file
// pre-existed (caught "file already exists" above), fall through so the entry is appended below.
if (createdWithInitialContent && mode === 'ensure_exists_and_append') { return; }
}
// If file exists (or was just created and mode is not 'ensure_exists_and_append' where content was initial)
if (file) {
if (mode === 'overwrite') {
let newContent = "";
if (processorNameForLogTitle) { // Keep the title if overwriting
newContent += `# Analysis Log: ${processorNameForLogTitle}\n\n`;
}
newContent += contentToAppendOrInitial;
await this.app.vault.modify(file, newContent);
if (this.settings.verboseDebug) console.log(`Log file overwritten: ${filePath}`);
} else if (mode === 'append' || (mode === 'ensure_exists_and_append' && file)) { // Append if mode is append or (ensure_exists_and_append and file already existed)
await this.app.vault.append(file, contentToAppendOrInitial);
if (this.settings.verboseDebug) console.log(`Content appended to log file: ${filePath}`);
}
} else if (mode === 'append') {
// File doesn't exist and mode is 'append' (strict append, not create)
new Notice(`Log file ${filePath} not found. Cannot append.`);
if (this.settings.verboseDebug) console.log(`Log file not found for append: ${filePath}`);
}
}
async getRightBrainAccessToken(): Promise<string | null> {
if (!this.settings.rightbrainClientId || !this.settings.rightbrainClientSecret) {
new Notice("RightBrain Client ID or Secret not configured.");
return null;
}
// Simple token cache (in-memory, expires after some time)
if ((this as any)._rbToken && (this as any)._rbTokenExpiry > Date.now()) {
if (this.settings.verboseDebug) console.log("Using cached RightBrain token.");
return (this as any)._rbToken;
}
const tokenUrl = 'https://oauth.leftbrain.me/oauth2/token'; // OAuth2 token endpoint for the client_credentials grant
const bodyParams = new URLSearchParams();
bodyParams.append('grant_type', 'client_credentials');
// For client_secret_basic, credentials are in the Authorization header.
const credentials = `${this.settings.rightbrainClientId}:${this.settings.rightbrainClientSecret}`;
const encodedCredentials = btoa(credentials); // Base64 encode
const headers = {
'Authorization': `Basic ${encodedCredentials}`,
'Content-Type': 'application/x-www-form-urlencoded',
'User-Agent': `ObsidianProcessorProcessorPlugin/${this.manifest.version}` // Good practice to include User-Agent
};
try {
if (this.settings.verboseDebug) console.log("Requesting new RightBrain token with client_secret_basic.");
const response = await requestUrl({
url: tokenUrl,
method: 'POST',
headers: headers,
body: bodyParams.toString(),
throw: false // Handle non-2xx responses manually
});
if (this.settings.verboseDebug) {
console.log(`RB Token Request Status: ${response.status}. Response Text Snippet: ${response.text ? response.text.substring(0, 200) : "No Body"}`);
}
if (response.status === 200 && response.json && response.json.access_token) {
if (this.settings.verboseDebug) console.log("Successfully obtained new RightBrain token.");
(this as any)._rbToken = response.json.access_token;
(this as any)._rbTokenExpiry = Date.now() + (response.json.expires_in || 3600) * 1000 - 600000; // Subtract 10 mins buffer
return response.json.access_token;
} else {
console.error("ProcessorProcessor: Failed to get RightBrain token.", response.status, response.text);
new Notice(`Failed to get RightBrain token: ${response.status}. Error: ${response.json?.error_description || response.text}. Check console.`);
(this as any)._rbToken = null; // Clear any stale token
(this as any)._rbTokenExpiry = 0;
return null;
}
} catch (error) {
console.error("ProcessorProcessor: Network error fetching RightBrain token:", error);
new Notice("Network error fetching RightBrain token. Check console.");
(this as any)._rbToken = null;
(this as any)._rbTokenExpiry = 0;
return null;
}
}
private generateSearchQueries(processorName: string): string[] {
// Sanitize processorName for use in queries (e.g., remove "Inc.", "LLC")
const cleanedName = processorName
.replace(/\b(?:inc\.?|llc\.?|ltd\.?|corp\.?|gmbh\.?|incorporated|limited|corporation)\b/gi, '')
.replace(/[,.]/g, '') // Remove commas and periods that might break search
.trim();
return [
`"${cleanedName}" sub-processor list`,
`"${cleanedName}" subprocessors`,
`"${cleanedName}" data processing addendum exhibit`,
`"${cleanedName}" DPA subprocessors`,
`"${cleanedName}" third-party vendors`,
`"${cleanedName}" service providers list`,
// More generic but sometimes useful for finding portals
`"${cleanedName}" trust center subprocessors`,
`"${cleanedName}" legal subprocessors`,
// If the name is short, broad searches might be too noisy.
// Consider adding quotes around cleanedName if it contains spaces.
];
}
private async searchSerpApiForDpas(processorName: string, queries: string[], maxResultsSetting: number): Promise<SerpApiResult[]> {
if (!this.settings.serpApiKey) {
new Notice("SerpAPI key not set. Cannot perform SerpAPI search.");
return [];
}
const allResults: SerpApiResult[] = [];
const processedUrls = new Set<string>(); // To avoid duplicate URLs from different queries
// Use a smaller number of queries for SerpAPI to manage cost/rate limits
const queriesToRun = queries.slice(0, Math.min(queries.length, 3)); // e.g., run first 3 queries
new Notice(`Searching SerpAPI for ${processorName} using ${queriesToRun.length} queries...`, 3000);
for (const query of queriesToRun) {
if (allResults.length >= maxResultsSetting && maxResultsSetting > 0) {
// if (this.settings.verboseDebug) console.log(`Max results (${maxResultsSetting}) reached for ${processorName}, stopping SerpAPI search.`);
break; // Stop once the configured maximum number of results has been collected (effectively 1 with the current logic)
}
const params = new URLSearchParams({
api_key: this.settings.serpApiKey,
q: query,
engine: "google", // Or other engines like 'bing'
num: "10", // Number of results per query (max 100 for Google, usually 10-20 is fine)
// You can add other params like 'location', 'gl' (country), 'hl' (language) if needed
});
const serpApiUrl = `https://serpapi.com/search?${params.toString()}`;
try {
const response = await requestUrl({ url: serpApiUrl, method: 'GET', throw: false });
if (response.status === 200 && response.json && response.json.organic_results) {
const organicResults = response.json.organic_results;
for (const result of organicResults) {
if (result.link && !processedUrls.has(result.link)) {
const urlLower = result.link.toLowerCase();
const titleLower = result.title?.toLowerCase() || "";
const snippetLower = result.snippet?.toLowerCase() || "";
// Basic keyword check in URL, title, or snippet
const isRelevant = SUBPROCESSOR_URL_KEYWORDS.some(keyword =>
urlLower.includes(keyword) || titleLower.includes(keyword) || snippetLower.includes(keyword)
);
if (isRelevant) {
allResults.push({
processorName: processorName,
title: result.title || "No Title",
url: result.link,
snippet: result.snippet || "No Snippet",
searchQuery: query,
documentType: 'serpapi_dpa_or_subprocessor_list_candidate' // Mark as potential candidate
});
processedUrls.add(result.link);
if (allResults.length >= maxResultsSetting && maxResultsSetting > 0) break;
}
}
}
} else {
console.error(`SerpAPI error for query "${query}": ${response.status}`, response.text?.substring(0, 200));
new Notice(`SerpAPI query failed for "${query.substring(0,20)}...". Status: ${response.status}`);
}
} catch (error) {
console.error(`Network error during SerpAPI search for query "${query}":`, error);
new Notice(`Network error during SerpAPI search for "${query.substring(0,20)}...".`);
}
await new Promise(resolve => setTimeout(resolve, 500 + Math.random() * 300)); // Delay between API calls
}
if (this.settings.verboseDebug) console.log(`SerpAPI search for ${processorName} found ${allResults.length} relevant candidates.`);
return allResults;
}
private getCompanyDomain(processorName: string): string {
// Basic domain extraction - this is naive and can be improved.
// It assumes processorName might be like "Company Name Inc." or "company.com"
let name = processorName.toLowerCase();
name = name.replace(/\b(?:inc\.?|llc\.?|ltd\.?|corp\.?|gmbh\.?)\b/g, '').trim(); // Remove common suffixes
name = name.replace(/[,.]/g, ''); // Remove commas, periods
try {
// If it looks like a URL already
if (name.includes('.') && !name.includes(' ')) {
const url = new URL(name.startsWith('http') ? name : `http://${name}`);
return url.hostname.replace(/^www\./, ''); // Remove www.
}
} catch (e) { /* Not a valid URL, proceed */ }
// If it's a multi-word name, try to form a domain (e.g., "Company Name" -> "companyname.com")
// This is highly speculative and often wrong.
// A better approach is to look for official website in search results.
const parts = name.split(/\s+/);
if (parts.length > 1) {
// return parts.join('').toLowerCase() + ".com"; // Very naive
return ""; // Better to not guess if unsure
}
return name; // If single word, assume it might be part of a domain
}
private isValidUrl(url: string, processorNameContext: string = ""): boolean {
if (!url || typeof url !== 'string') return false;
try {
const parsedUrl = new URL(url);
// Allow http and https protocols
if (!['http:', 'https:'].includes(parsedUrl.protocol)) {
return false;
}
// Optional: check if domain seems related to processorNameContext if provided
if (processorNameContext) {
const processorDomain = this.getCompanyDomain(processorNameContext);
if (processorDomain && !parsedUrl.hostname.toLowerCase().includes(processorDomain.replace(/^www\./, ''))) {
// This is a soft check, might be too restrictive.
// if (this.settings.verboseDebug) console.log(`URL ${url} hostname ${parsedUrl.hostname} doesn't match context ${processorDomain}`);
}
}
return true;
} catch (e) {
return false; // Invalid URL format
}
}
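// Fetches the given page and scans its anchor tags for links that look like subprocessor lists
// (keyword match on the URL or the link text), returning them as candidate search results.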
private async extractUrlsFromDpaPage(pageUrl: string, processorNameContext: string, sourcePageTitle?: string): Promise<SerpApiResult[]> {
if (!this.settings.rightbrainVerifyUrlTaskId) { // Using verify task ID as a proxy for "RB is configured"
// if (this.settings.verboseDebug) console.log("RB not configured, skipping URL extraction from DPA page content.");
return [];
}
const rbToken = await this.getRightBrainAccessToken();
if (!rbToken) return [];
const extractedLinks: SerpApiResult[] = [];
// This would ideally use a RightBrain task designed to fetch a page and extract all <a> hrefs.
// For now, let's simulate a simplified version of what such a task might return if we pass pageUrl.
// A proper RB task would take `pageUrl` as input to a `url_fetcher` and then parse its HTML output.
// Simulate fetching page content (very basic, not robust)
let pageContent = "";
try {
const response = await requestUrl({url: pageUrl, method: 'GET', throw: false});
if (response.status === 200) {
pageContent = response.text;
} else {
// if (this.settings.verboseDebug) console.warn(`Failed to fetch ${pageUrl} for link extraction, status: ${response.status}`);
return [];
}
} catch (e) {
// if (this.settings.verboseDebug) console.error(`Error fetching ${pageUrl} for link extraction:`, e);
return [];
}
if (!pageContent) return [];
// Regex to capture both the href value and the inner link text of each <a> tag
const linkRegex = /<a\s+(?:[^>]*?\s+)?href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi;
let match;
while ((match = linkRegex.exec(pageContent)) !== null) {
let href = match[1].trim();
if (href && !href.startsWith('#') && !href.startsWith('mailto:') && !href.startsWith('javascript:')) {
try {
const absoluteUrl = new URL(href, pageUrl).toString(); // Resolve relative URLs
if (this.isValidUrl(absoluteUrl, processorNameContext)) {
// Check if this link itself looks like a subprocessor list
const urlLower = absoluteUrl.toLowerCase();
const titleOrTextLower = (match[2] || "").replace(/<[^>]+>/g, '').toLowerCase(); // Link text with any nested tags stripped
const isPotentialSubprocessorList = SUBPROCESSOR_URL_KEYWORDS.some(keyword =>
urlLower.includes(keyword) || titleOrTextLower.includes(keyword)
);
if (isPotentialSubprocessorList) {
extractedLinks.push({
processorName: processorNameContext,
title: `Linked from: ${sourcePageTitle || pageUrl}`,
url: absoluteUrl,
snippet: `Found on page: ${pageUrl}`,
documentType: 'linked_subprocessor_list_candidate',
sourceDpaUrl: pageUrl
});
}
}
} catch (e) { /* Invalid URL, skip */ }
}
}
// if (this.settings.verboseDebug && extractedLinks.length > 0) {
// console.log(`Extracted ${extractedLinks.length} potential subprocessor list URLs from ${pageUrl}`);
// }
return extractedLinks;
}
private async callRightBrainTask(taskId: string, taskVariables: Record<string, any>, rbToken: string): Promise<any | null> {
if (!taskId) {
new Notice("RightBrain Task ID is missing for the call.");
console.error("ProcessorProcessor: Attempted to call RightBrain task with no Task ID.");
return null;
}
if (!this.settings.rightbrainOrgId || !this.settings.rightbrainProjectId) {
new Notice("RightBrain Org ID or Project ID not set. Cannot call task.");
console.error("ProcessorProcessor: RB OrgID or ProjectID missing for task call.");
return null;
}
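// NOTE: the task-run host below is hardcoded; adjust it if your RightBrain project runs in a different environment.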
const taskRunUrl = `https://stag.leftbrain.me/api/v1/org/${this.settings.rightbrainOrgId}/project/${this.settings.rightbrainProjectId}/task/${taskId}/run`;
const headers = {
'Authorization': `Bearer ${rbToken}`,
'Content-Type': 'application/json',
'User-Agent': `ObsidianProcessorProcessorPlugin/${this.manifest.version}`
};
// Wrap the provided variables in the 'task_input' envelope expected by the task-run endpoint.
const payload = {
task_input: taskVariables
};
try {
const response = await requestUrl({
url: taskRunUrl,
method: 'POST',
headers: headers,
body: JSON.stringify(payload), // Send the newly constructed payload
throw: false
});
if (response.json && (response.status === 200 || response.status === 201)) {
return response.json;
} else {
new Notice(`RightBrain Task ${taskId.substring(0,8)}... failed: ${response.status}. Check console.`, 7000);
console.error(`RB Task Call [${taskId}] Error: ${response.status}`, response.text ? response.text.substring(0, 1000) : "No body", "Payload Sent:", payload);
return null;
}
} catch (error: any) {
new Notice(`Network error calling RightBrain Task ${taskId.substring(0,8)}.... Check console.`, 7000);
console.error(`RB Task Call [${taskId}] Network Error:`, error, "Payload Sent:", payload);
return null;
}
}
private async verifySubprocessorListUrl(urlToVerify: string, processorName: string, rbToken: string): Promise<{ isList: boolean; isCurrent: boolean; isCorrectProcessor: boolean; reasoning: string; pageContent?: string } | null> {
if (!this.settings.rightbrainVerifyUrlTaskId) {
new Notice("RightBrain Verify URL Task ID is not configured. Cannot verify URL.");
return null;
}
// The input parameter names must match the RightBrain task definition. This task is
// assumed to take { "url_content": <url>, "expected_processor_name": <name> }, with a
// url_fetcher input processor configured on "url_content" so the page is fetched server-side.
const taskInput = {
"url_content": urlToVerify,
"expected_processor_name": processorName
};
if (this.settings.verboseDebug) console.log(`Verifying URL ${urlToVerify} with RB Task ${this.settings.rightbrainVerifyUrlTaskId}. Input:`, JSON.stringify(taskInput));
const taskResult = await this.callRightBrainTask(this.settings.rightbrainVerifyUrlTaskId, taskInput, rbToken);
if (this.settings.verboseDebug) {
console.log(`RB Verify Task [${this.settings.rightbrainVerifyUrlTaskId}] Full Result for URL ${urlToVerify}:`, JSON.stringify(taskResult, null, 2));
}
// Process the taskResult
if (taskResult && typeof taskResult.response === 'object' && taskResult.response !== null) {
const rbResponse = taskResult.response;
const isList = String(rbResponse.isSubprocessorList).toLowerCase() === 'true';
const isCorrectProcessor = String(rbResponse.isCorrectProcessor).toLowerCase() === 'true';
const isCurrent = String(rbResponse.isCurrentVersion).toLowerCase() === 'true';
const reasoning = rbResponse.reasoning || "N/A";
// Attempt to get pageContent if url_fetcher was used and passed it through
let pageContent: string | undefined = undefined;
// Check common places where fetched HTML might be stored by RB/url_fetcher
if (taskResult.run_data && taskResult.run_data.submitted &&
typeof taskResult.run_data.submitted.url_content === 'string' &&
taskResult.run_data.submitted.url_content.toLowerCase().includes('<html')) { // Check if it looks like HTML
// This assumes 'url_content' in 'submitted' data is the fetched HTML if url_fetcher was used.
// This depends heavily on RB's internal structure for `run_wait_for_response`.
pageContent = taskResult.run_data.submitted.url_content;
if(this.settings.verboseDebug) console.log("Retrieved pageContent from run_data.submitted.url_content for verify task");
} else if (typeof rbResponse.fetched_page_html === 'string') { // If LLM explicitly passes it back
pageContent = rbResponse.fetched_page_html;
if(this.settings.verboseDebug) console.log("Retrieved pageContent from rbResponse.fetched_page_html for verify task");
} else if (typeof rbResponse.page_content === 'string') { // Another potential fallback
pageContent = rbResponse.page_content;
if(this.settings.verboseDebug) console.log("Retrieved pageContent from rbResponse.page_content (fallback) for verify task");
}
// More robust: if your RB task's input_processor (url_fetcher) stores its output in a known way within `taskResult.run_data.input_processor_outputs`, access it there.
// e.g., if (taskResult.run_data?.input_processor_outputs?.url_content?.text_content) { pageContent = taskResult.run_data.input_processor_outputs.url_content.text_content; }
if (this.settings.verboseDebug) {
console.log(`RB Verify for ${urlToVerify}: List=${isList}, Current=${isCurrent}, Content available: ${!!pageContent}, Content snippet: ${pageContent ? pageContent.substring(0,100) + "..." : "N/A"}`);
}
return { isList, isCurrent: (isList && isCurrent), isCorrectProcessor, reasoning, pageContent };
}
if (this.settings.verboseDebug) {
console.warn(`RB Verify task for ${urlToVerify} failed or returned unexpected response format. TaskResult:`, taskResult);
}
return null; // Verification failed or task output was not as expected
}
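// Sends fetched page text to the RightBrain extraction task and returns the two entity lists:
// third-party subprocessors and the processor's own/affiliated entities.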
private async extractEntitiesFromPageContent(pageContent: string, rbToken: string): Promise<{ thirdPartySubprocessors: any[]; ownEntities: any[] } | null> {
if (!this.settings.rightbrainExtractEntitiesTaskId) {
new Notice("RB Extract Entities Task ID missing. Cannot extract from content.");
return null;
}
if (!pageContent.trim()) {
// if (this.settings.verboseDebug) console.log("Page content is empty, skipping entity extraction.");
return { thirdPartySubprocessors: [], ownEntities: [] }; // Return empty if no content
}
// The input field name for the RB task must match the task definition.
// e.g., if the task expects { "text_to_analyze": "..." }, use that here.
const taskInput = { [this.settings.rightbrainExtractInputField]: pageContent };
// if (this.settings.verboseDebug) console.log(`Extracting entities with RB Task ${this.settings.rightbrainExtractEntitiesTaskId}. Input snippet:`, pageContent.substring(0, 200) + "...");
const taskResult = await this.callRightBrainTask(this.settings.rightbrainExtractEntitiesTaskId, taskInput, rbToken);
// if (this.settings.verboseDebug && taskResult) {
// console.log(`RB Extract Entities Task Full Result:`, JSON.stringify(taskResult, null, 2));
// }
if (taskResult && typeof taskResult.response === 'object' && taskResult.response !== null) {
const rbResponse = taskResult.response;
// Access the arrays using the configured field names for third-party and own entities
const thirdPartySubprocessors = rbResponse[this.settings.rightbrainExtractOutputThirdPartyField] || [];
const ownEntities = rbResponse[this.settings.rightbrainExtractOutputOwnEntitiesField] || [];
// Ensure they are arrays
return {
thirdPartySubprocessors: Array.isArray(thirdPartySubprocessors) ? thirdPartySubprocessors : [],
ownEntities: Array.isArray(ownEntities) ? ownEntities : []
};
}
// if (this.settings.verboseDebug) {
// console.warn(`RB Extract Entities task failed or returned unexpected response format. TaskResult:`, taskResult);
// }
return null;
}
private async updateDiscoveryStatus(file: TFile, status: 'complete' | 'incomplete' | 'skipped') {
if (!file) return;
await this.app.vault.process(file, (content) => {
const updates: any = {
'discovery-status': status
};
if (status === 'complete') {
updates['last-discovered'] = new Date().toISOString().split('T')[0]; // YYYY-MM-DD format
}
return this.updateFrontmatter(content, updates, file.basename);
});
}
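// Builds a lowercase alias -> { note path, canonical name } lookup across the processors folder,
// so newly discovered names can be mapped onto existing processor notes.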
private async buildAliasMap(): Promise<Map<string, { path: string, canonicalName: string }>> {
const aliasMap = new Map<string, { path: string, canonicalName: string }>();
const processorsFolder = this.app.vault.getAbstractFileByPath(this.settings.processorsFolderPath) as TFolder;
if (!processorsFolder?.children) return aliasMap;
for (const file of processorsFolder.children) {
if (file instanceof TFile && file.extension === 'md') {
const cache = this.app.metadataCache.getFileCache(file);
const frontmatter = cache?.frontmatter || {};
const canonicalName = frontmatter.aliases?.[0] || file.basename;
const aliases = (frontmatter.aliases || []).map((a: string) => String(a).toLowerCase());
aliases.push(file.basename.toLowerCase());
for (const alias of new Set(aliases)) {
if (alias) {
aliasMap.set(alias, { path: file.path, canonicalName });
}
}
}
}
return aliasMap;
}
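// Gathers every Markdown note in the folder (plus its aliases) and sends the set to the RightBrain
// deduplication task; any duplicate sets it returns are merged via processDeduplicationResults.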
async runDeduplicationForFolder(folder: TFolder) {
new Notice(`Preparing to deduplicate pages in ${folder.path}...`);
if (!this.settings.rightbrainDeduplicateSubprocessorsTaskId) {
new Notice("Deduplication Task ID not set. Cannot proceed.");
return;
}
const rbToken = await this.getRightBrainAccessToken();
if (!rbToken) {
new Notice("Could not get RightBrain token for deduplication.");
return;
}
const files = folder.children.filter(f => f instanceof TFile && f.extension === 'md') as TFile[];
if (files.length < 2) {
new Notice("Not enough Markdown files in the folder to perform deduplication.");
return;
}
const subprocessorPagesInfo: SubprocessorPageInfo[] = [];
for (const file of files) {
const fileCache = this.app.metadataCache.getFileCache(file);
const frontmatter = fileCache?.frontmatter;
const aliases = (frontmatter?.aliases && Array.isArray(frontmatter.aliases)) ? frontmatter.aliases.map(String) : [];
if (frontmatter?.company_name) aliases.push(String(frontmatter.company_name)); // Include company_name if present
aliases.push(file.basename); // Include basename as an alias
subprocessorPagesInfo.push({
file_path: file.path,
page_name: file.basename, // Or a more canonical name from frontmatter if available
aliases: Array.from(new Set(aliases.filter(a => a))) // Unique, non-empty aliases
});
}
if (subprocessorPagesInfo.length < 2) {
new Notice("Not enough processable pages with aliases found for deduplication.");
return;
}
const taskInputPayload = {
subprocessor_pages: subprocessorPagesInfo,
// Optional: Add a threshold or other parameters if your RB task supports them
// "similarity_threshold": 0.8
};
new Notice(`Sending ${subprocessorPagesInfo.length} pages to RightBrain for deduplication analysis... This may take a while.`);
// if(this.settings.verboseDebug) console.log("Deduplication payload:", JSON.stringify(taskInputPayload));
const taskResult = await this.callRightBrainTask(this.settings.rightbrainDeduplicateSubprocessorsTaskId, taskInputPayload, rbToken);
// if(this.settings.verboseDebug && taskResult) {
// console.log("Deduplication Task Full Result:", JSON.stringify(taskResult, null, 2));
// }
if (taskResult && taskResult.response && Array.isArray(taskResult.response.deduplication_results)) {
const deduplicationResults: DeduplicationResultItem[] = taskResult.response.deduplication_results;
if (deduplicationResults.length === 0) {
new Notice("No duplicates found by RightBrain task.");
return;
}
new Notice(`Deduplication analysis complete. Found ${deduplicationResults.length} potential duplicate sets. Processing merges...`);
await this.processDeduplicationResults(deduplicationResults);
} else {
new Notice("Deduplication task failed or returned an unexpected response. Check console.");
console.error("Deduplication task error. Response:", taskResult);
}
}
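// For each duplicate set: merge aliases and "Used By" table rows from the duplicates into the
// survivor note, delete the duplicates, and rewrite the survivor's frontmatter and body.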
async processDeduplicationResults(results: DeduplicationResultItem[]) {
let mergeCount = 0;
for (const resultSet of results) {
if (!resultSet.survivor_file_path || resultSet.duplicate_file_paths.length === 0) {
if (this.settings.verboseDebug) console.warn("Skipping invalid deduplication result set:", resultSet);
continue;
}
const survivorFile = this.app.vault.getAbstractFileByPath(resultSet.survivor_file_path) as TFile;
if (!survivorFile) {
if (this.settings.verboseDebug) console.warn(`Survivor file not found: ${resultSet.survivor_file_path}`);
continue;
}
const originalSurvivorContent = await this.app.vault.read(survivorFile);
// --- Step 1: Gather all data from survivor and duplicates ---
// Gather aliases and rows from the survivor file first
const survivorCache = this.app.metadataCache.getFileCache(survivorFile);
const allAliases = new Set<string>((survivorCache?.frontmatter?.aliases || []).map(String));
allAliases.add(survivorFile.basename);
const allRows = new Set<string>(this.extractClientTableRows(originalSurvivorContent));
// Now, loop through duplicates to gather their data
for (const dupFilePath of resultSet.duplicate_file_paths) {
if (dupFilePath === survivorFile.path) continue;
const dupFile = this.app.vault.getAbstractFileByPath(dupFilePath) as TFile;
if (dupFile) {
const dupContent = await this.app.vault.read(dupFile);
const dupCache = this.app.metadataCache.getFileCache(dupFile);
// Add duplicate's aliases and basename to the set
(dupCache?.frontmatter?.aliases || []).map(String).forEach(alias => allAliases.add(alias));
allAliases.add(dupFile.basename);
// Add duplicate's "Used By" rows to the set
this.extractClientTableRows(dupContent).forEach(row => allRows.add(row));
try {
await this.app.vault.delete(dupFile);
} catch (e) {
console.error(`Failed to delete duplicate file ${dupFilePath}:`, e);
}
}
}
// --- Step 2: Rebuild the file from scratch with merged data ---
// 2A: Isolate the original body of the survivor file (everything after the frontmatter)
const fmRegex = /^---\s*\n([\s\S]*?)\n---\s*\n/;
const match = originalSurvivorContent.match(fmRegex);
let survivorBody = match ? originalSurvivorContent.substring(match[0].length) : originalSurvivorContent;
// 2B: Rebuild the frontmatter string with all merged aliases
const existingTags = new Set<string>((survivorCache?.frontmatter?.tags || []).map(String));
let newFmString = "---\n";
newFmString += `aliases: [${Array.from(allAliases).map(a => `"${a.replace(/"/g, '\\"')}"`).join(', ')}]\n`;
if (existingTags.size > 0) {
newFmString += `tags: [${Array.from(existingTags).map(t => `"${t}"`).join(', ')}]\n`;
}
newFmString += "---\n";
// 2C: Rebuild the "Used By" table markdown string from the merged rows
let clientTableMd = "";
if (allRows.size > 0) {
clientTableMd += `| Primary Processor | Processing Function | Location | Source URL |\n`;
clientTableMd += `|---|---|---|---|\n`;
allRows.forEach(row => {
clientTableMd += `|${row}|\n`;
});
}
// 2D: Replace the "Used By" section within the isolated body
const finalBody = this.ensureHeadingAndSection(survivorBody, "Used By", clientTableMd, null, null);
// 2E: Assemble the final, complete content
const finalContent = newFmString + finalBody;
// --- Step 3: Write the final content back to the survivor file ---
await this.app.vault.modify(survivorFile, finalContent);
mergeCount++;
new Notice(`Merged ${resultSet.duplicate_file_paths.length} duplicate(s) into ${survivorFile.basename}.`);
}
if (mergeCount > 0) {
new Notice(`Deduplication finished. ${mergeCount} merge operations performed.`);
} else {
new Notice("Deduplication process finished, but no actionable merges were made.");
}
}
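// Manual counterpart to processDeduplicationResults: merges the user-selected duplicate notes
// into the chosen survivor, then deletes the duplicates.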
async processManualMerge(survivorFile: TFile, duplicateFiles: TFile[]) {
if (!survivorFile || duplicateFiles.length === 0) {
new Notice("Merge cancelled: No survivor or duplicates selected.");
return;
}
new Notice(`Merging ${duplicateFiles.length} file(s) into ${survivorFile.basename}...`, 6000);
try {
const originalSurvivorContent = await this.app.vault.read(survivorFile);
// --- Step 1: Gather all data from survivor and duplicates ---
const survivorCache = this.app.metadataCache.getFileCache(survivorFile);
const allAliases = new Set<string>((survivorCache?.frontmatter?.aliases || []).map(String));
allAliases.add(survivorFile.basename); // Add survivor's own name
const allRows = new Set<string>(this.extractClientTableRows(originalSurvivorContent));
for (const dupFile of duplicateFiles) {
const dupContent = await this.app.vault.read(dupFile);
const dupCache = this.app.metadataCache.getFileCache(dupFile);
// Add duplicate's aliases and basename to the set
(dupCache?.frontmatter?.aliases || []).map(String).forEach(alias => allAliases.add(alias));
allAliases.add(dupFile.basename);
// Add duplicate's "Used By" table rows to the set
this.extractClientTableRows(dupContent).forEach(row => allRows.add(row));
}
// --- Step 2: Rebuild the survivor file with merged data ---
const fmRegex = /^---\s*\n([\s\S]*?)\n---\s*\n/;
const match = originalSurvivorContent.match(fmRegex);
let survivorBody = match ? originalSurvivorContent.substring(match[0].length) : originalSurvivorContent;
// Rebuild frontmatter
const existingTags = new Set<string>((survivorCache?.frontmatter?.tags || []).map(String));
let newFmString = "---\n";
newFmString += `aliases: [${Array.from(allAliases).map(a => `"${a.replace(/"/g, '\\"')}"`).join(', ')}]\n`;
if (existingTags.size > 0) {
newFmString += `tags: [${Array.from(existingTags).map(t => `"${t}"`).join(', ')}]\n`;
}
newFmString += "---\n";
// Rebuild "Used By" table
let clientTableMd = "";
if (allRows.size > 0) {
clientTableMd += `| Primary Processor | Processing Function | Location | Source URL |\n`;
clientTableMd += `|---|---|---|---|\n`;
allRows.forEach(row => {
clientTableMd += `|${row}|\n`;
});
}
// Replace the "Used By" section within the survivor's body
const finalBody = this.ensureHeadingAndSection(survivorBody, "Used By", clientTableMd, null, null);
const finalContent = newFmString + finalBody;
// --- Step 3: Write to survivor and delete duplicates ---
await this.app.vault.modify(survivorFile, finalContent);
for (const dupFile of duplicateFiles) {
await this.app.vault.delete(dupFile);
}
new Notice(`Successfully merged ${duplicateFiles.length} file(s) into ${survivorFile.basename}.`);
} catch (error) {
console.error("Error during manual merge:", error);
new Notice("An error occurred during the merge. Check the developer console.");
}
}
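// Returns the inner cell content (without the outer pipes) of each data row in the "Used By"
// table, so rows can be de-duplicated with a Set and re-emitted later.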
private extractClientTableRows(content: string): string[] {
const rows: string[] = [];
const lines = content.split('\n');
let inUsedBySection = false;
let tableHasStarted = false;
for (const line of lines) {
// Find the heading to start the process
if (line.match(/^##+\s*Used By\s*$/i)) {
inUsedBySection = true;
tableHasStarted = false; // Reset in case of multiple "Used By" sections
continue;
}
// Once we are in the right section, look for the table
if (inUsedBySection) {
const trimmedLine = line.trim();
// Stop at the next heading of level 2 or deeper; it ends the "Used By" section
if (trimmedLine.startsWith('##')) {
inUsedBySection = false;
break;
}
// Find the table separator to begin capturing rows
if (trimmedLine.match(/^\|---\|/)) {
tableHasStarted = true;
continue;
}
// If the table has started, capture valid row content
if (tableHasStarted && trimmedLine.startsWith('|') && trimmedLine.endsWith('|')) {
// Extract content between the first and last pipe
const match = trimmedLine.match(/^\|(.*)\|$/);
if (match && match[1]) {
// Check that it's a content row, not another separator
if (!match[1].match(/^---\|/)) {
rows.push(match[1]);
}
}
} else if (tableHasStarted && trimmedLine !== "") {
// If the table had started and we find a non-empty, non-table row, assume the table has ended.
break;
}
}
}
return rows;
}
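// Breadth-first discovery: processes the starting processor, then queues each discovered
// subprocessor up to maxDepth, skipping entities whose discovery-status is "complete" and
// still within the discoveryCacheDays window.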
async discoverRecursively(initialProcessorName: string, initialProcessorFile?: TFile, maxDepth: number = 3) {
new Notice(`Starting smart recursive discovery for: ${initialProcessorName}. Max depth: ${maxDepth}`, 10000);
const aliasMap = await this.buildAliasMap();
this.processedInCurrentRecursiveSearch = new Set<string>();
const queue: Array<{ processorName: string, depth: number }> = [{ processorName: initialProcessorName, depth: 0 }];
let discoveredCount = 0;
let skippedCount = 0;
while (queue.length > 0) {
const current = queue.shift();
if (!current) continue;
let { processorName, depth } = current;
// --- State-Aware Processing Check ---
const existingEntity = aliasMap.get(processorName.toLowerCase());
let currentProcessorFile = existingEntity ? this.app.vault.getAbstractFileByPath(existingEntity.path) as TFile : null;
if (currentProcessorFile) {
const cache = this.app.metadataCache.getFileCache(currentProcessorFile);
if (cache?.frontmatter?.['discovery-status'] === 'complete' && cache?.frontmatter?.['last-discovered']) {
const lastRun = new Date(cache.frontmatter['last-discovered']);
const expiryDate = new Date();
expiryDate.setDate(expiryDate.getDate() - this.settings.discoveryCacheDays);
if (lastRun > expiryDate) {
if (this.settings.verboseDebug) console.log(`Skipping recently processed: ${processorName}`);
skippedCount++;
continue;
}
}
}
new Notice(`Recursive (depth ${depth}): Processing ${processorName}...`);
const { filePathName: sanitizedNameForTracking } = this.sanitizeNameForFilePathAndAlias(processorName);
if (this.processedInCurrentRecursiveSearch.has(sanitizedNameForTracking)) continue;
this.processedInCurrentRecursiveSearch.add(sanitizedNameForTracking);
const isTopLevel = depth === 0;
if (!currentProcessorFile) {
currentProcessorFile = await this.ensureProcessorFile(processorName, true, isTopLevel);
}
if (!currentProcessorFile) continue;
discoveredCount++;
const searchData = await this.fetchProcessorSearchDataWithDiscovery(processorName);
if (searchData?.collectedRelationships) {
const directSubNames = Array.from(new Set(searchData.collectedRelationships
.filter(rel => rel.PrimaryProcessor === processorName && rel.RelationshipType === 'uses_subprocessor')
.map(rel => rel.SubprocessorName.trim())
.filter(name => name)));
const mergeDecisionsLog: string[] = [];
if (depth < maxDepth - 1) {
for (const subName of directSubNames) {
const sanitizedSubNameForTracking = this.sanitizeNameForFilePathAndAlias(subName).filePathName;
if (this.processedInCurrentRecursiveSearch.has(sanitizedSubNameForTracking)) continue;
const existingMapping = aliasMap.get(subName.toLowerCase());
let nameToQueue = subName;
if (existingMapping) {
nameToQueue = existingMapping.canonicalName;
if (subName !== nameToQueue) {
const decision = `Mapped discovered name "${subName}" to existing processor "${nameToQueue}".`;
mergeDecisionsLog.push(decision);
}
} else {
// It's a new entity, add it to our map for this run to catch duplicates within the same run
const { filePathName, originalNameAsAlias } = this.sanitizeNameForFilePathAndAlias(subName);
const newPath = `${this.settings.processorsFolderPath}/${filePathName}.md`;
aliasMap.set(subName.toLowerCase(), { path: newPath, canonicalName: originalNameAsAlias });
}
if (!queue.some(q => q.processorName === nameToQueue)) {
queue.push({ processorName: nameToQueue, depth: depth + 1 });
}
}
}
await this.persistSubprocessorInfo(processorName, currentProcessorFile, searchData, isTopLevel, mergeDecisionsLog);
await this.updateDiscoveryStatus(currentProcessorFile, 'complete');
} else {
await this.updateDiscoveryStatus(currentProcessorFile, 'incomplete');
}
await new Promise(resolve => setTimeout(resolve, 500));
}
new Notice(`Recursive discovery complete. Processed ${discoveredCount} entities, skipped ${skippedCount} recent ones.`, 10000);
this.processedInCurrentRecursiveSearch.clear();
}
private openFileSelectorMergeModal() {
const files = this.app.vault.getMarkdownFiles().filter(file => file.path.startsWith(this.settings.processorsFolderPath + "/"));
if (files.length < 2) {
new Notice("There are not enough processor files to perform a merge.");
return;
}
new FileSelectorMergeModal(this.app, files, (selectedFiles) => {
// After the user selects files, we open the second modal to choose the survivor.
new ForceMergeModal(this.app, selectedFiles, (survivor, duplicates) => {
this.processManualMerge(survivor, duplicates);
}).open();
}).open();
}
}
// ----- MODAL CLASSES -----
class ManualInputModal extends Modal {
processorName: string = '';
listUrl: string = '';
isPrimaryProcessor: boolean = true; // Whether this entry is treated as a primary (top-level) processor
onSubmit: (processorName: string, listUrl: string, isPrimary: boolean) => Promise<void>;
initialProcessorName?: string;
constructor(app: App, onSubmit: (processorName: string, listUrl: string, isPrimary: boolean) => Promise<void>, initialProcessorName?: string) {
super(app);
this.onSubmit = onSubmit;
this.initialProcessorName = initialProcessorName;
if (this.initialProcessorName) {
this.processorName = this.initialProcessorName;
}
}
onOpen() {
const { contentEl } = this;
contentEl.createEl('h2', { text: 'Manually Add Subprocessor List URL' });
new Setting(contentEl)
.setName('Processor Name')
.setDesc('Enter the name of the primary processor (e.g., OpenAI).')
.addText(text => {
text.setPlaceholder('Enter processor name')
.setValue(this.processorName)
.onChange(value => this.processorName = value)
.inputEl.setAttr("required", "true");
if (this.initialProcessorName) {
text.setDisabled(true);
}
});
new Setting(contentEl)
.setName('Subprocessor List URL')
.setDesc('Enter the direct URL to the subprocessor list or DPA page.')
.addText(text =>
text.setPlaceholder('https://example.com/subprocessors')
.setValue(this.listUrl)
.onChange(value => this.listUrl = value)
.inputEl.setAttr("required", "true"));
// New "Is Primary" Toggle Setting
new Setting(contentEl)
.setName('Is a primary processor?')
.setDesc('Enable this if you are initiating a search on this processor. Disable if you are adding a subprocessor of another entity.')
.addToggle(toggle => toggle
.setValue(this.isPrimaryProcessor)
.onChange(value => this.isPrimaryProcessor = value));
new Setting(contentEl)
.addButton(button =>
button.setButtonText('Process URL')
.setCta()
.onClick(() => {
if (!this.processorName.trim() || !this.listUrl.trim()) {
new Notice("Processor Name and Subprocessor List URL are required.");
return;
}
this.close();
this.onSubmit(this.processorName, this.listUrl, this.isPrimaryProcessor); // Pass the primary-processor flag
}));
}
onClose() {
this.contentEl.empty();
}
}
class SearchModal extends Modal {
processorName: string = '';
settings: ProcessorProcessorSettings; // To inform user about search method
onSubmit: (processorName: string) => Promise<void>;
constructor(app: App, settings: ProcessorProcessorSettings, onSubmit: (processorName: string) => Promise<void>) {
super(app);
this.settings = settings;
this.onSubmit = onSubmit;
}
onOpen() {
const { contentEl } = this;
contentEl.createEl('h2', { text: 'Discover Subprocessors' });
let searchMethodNote = "Search will be performed using available configured methods.";
if (this.settings.serpApiKey) {
searchMethodNote = "Search will primarily use SerpAPI.";
} else if (this.settings.rightbrainOrgId && this.settings.rightbrainProjectId && this.settings.rightbrainDuckDuckGoSearchTaskId) {
searchMethodNote = "SerpAPI key not found. Search will use DuckDuckGo via RightBrain.";
} else {
searchMethodNote = "Neither SerpAPI nor RightBrain DuckDuckGo search is fully configured. Discovery might be limited.";
}
contentEl.createEl('p', { text: searchMethodNote });
new Setting(contentEl)
.setName('Processor Name')
.setDesc('Enter the name of the processor to search for (e.g., Stripe).')
.addText(text =>
text.setPlaceholder('Enter processor name')
.setValue(this.processorName)
.onChange(value => this.processorName = value)
.inputEl.setAttr("required", "true"));
new Setting(contentEl)
.addButton(button =>
button.setButtonText('Start Discovery')
.setCta()
.onClick(() => {
if (!this.processorName.trim()) {
new Notice("Processor Name is required.");
return;
}
this.close();
this.onSubmit(this.processorName);
}));
}
onClose() {
this.contentEl.empty();
}
}
class ManualTextEntryModal extends Modal {
processorName: string = '';
pastedText: string = '';
isPrimaryProcessor: boolean = true; // Whether this entry is treated as a primary (top-level) processor
onSubmit: (processorName: string, pastedText: string, isPrimary: boolean) => Promise<void>;
initialProcessorName?: string;
constructor(app: App, onSubmit: (processorName: string, pastedText: string, isPrimary: boolean) => Promise<void>, initialProcessorName?: string) {
super(app);
this.onSubmit = onSubmit;
this.initialProcessorName = initialProcessorName;
if (this.initialProcessorName) {
this.processorName = this.initialProcessorName;
}
}
onOpen() {
const { contentEl } = this;
contentEl.createEl('h2', { text: 'Input Subprocessor List from Text' });
new Setting(contentEl)
.setName('Processor Name')
.setDesc('Enter the name of the primary processor this text belongs to.')
.addText(text => {
text.setPlaceholder('Enter processor name')
.setValue(this.processorName)
.onChange(value => this.processorName = value)
.inputEl.setAttr("required", "true");
if (this.initialProcessorName) {
text.setDisabled(true);
}
});
new Setting(contentEl)
.setName('Is a primary processor?')
.setDesc('Enable this if you are initiating a search on this processor. Disable if you are adding a subprocessor of another entity.')
.addToggle(toggle => toggle
.setValue(this.isPrimaryProcessor)
.onChange(value => this.isPrimaryProcessor = value));
contentEl.createEl('p', { text: 'Paste the subprocessor list text below:' });
const textArea = new TextAreaComponent(contentEl)
.setPlaceholder('Paste text here...')
.setValue(this.pastedText)
.onChange(value => this.pastedText = value);
textArea.inputEl.rows = 10;
textArea.inputEl.style.width = '100%';
textArea.inputEl.setAttr("required", "true");
new Setting(contentEl)
.addButton(button =>
button.setButtonText('Process Text')
.setCta()
.onClick(() => {
if (!this.processorName.trim() || !this.pastedText.trim()) {
new Notice("Processor Name and pasted text are both required.");
return;
}
this.close();
this.onSubmit(this.processorName, this.pastedText, this.isPrimaryProcessor); // Pass the primary-processor flag
}));
}
onClose() {
this.contentEl.empty();
}
}
class ForceMergeModal extends Modal {
files: TFile[];
onSubmit: (survivor: TFile, duplicates: TFile[]) => void;
private survivor: TFile | null = null;
constructor(app: App, files: TFile[], onSubmit: (survivor: TFile, duplicates: TFile[]) => void) {
super(app);
// Ensure files are sorted alphabetically for the user
this.files = files.sort((a, b) => a.basename.localeCompare(b.basename));
this.onSubmit = onSubmit;
}
onOpen() {
const { contentEl } = this;
contentEl.createEl('h2', { text: 'Force Merge Processors' });
contentEl.createEl('p', { text: 'Select the file to keep (the "survivor"). All other selected files will be merged into it and then deleted.' });
let mergeButton: ButtonComponent;
const radioGroup = contentEl.createDiv();
this.files.forEach(file => {
const setting = new Setting(radioGroup)
.setName(file.basename)
.setDesc(file.path);
// This creates a RADIO BUTTON for single selection
const radio = createEl('input', {
type: 'radio',
cls: 'force-merge-radio'
});
radio.name = "survivor-selection";
radio.value = file.path;
radio.onchange = () => {
this.survivor = file;
// This correctly enables the merge button
mergeButton.setDisabled(false).setCta();
};
setting.controlEl.appendChild(radio);
});
new Setting(contentEl)
.addButton(btn => btn
.setButtonText('Cancel')
.onClick(() => this.close()))
.addButton(btn => {
mergeButton = btn;
btn.setButtonText('Merge')
.setDisabled(true)
.onClick(() => {
if (this.survivor) {
const duplicates = this.files.filter(f => f.path !== this.survivor!.path);
this.close();
this.onSubmit(this.survivor, duplicates);
}
});
});
}
onClose() {
this.contentEl.empty();
}
}
class FileSelectorMergeModal extends Modal {
files: TFile[];
onSubmit: (selectedFiles: TFile[]) => void;
private selectedFilePaths: Set<string> = new Set();
constructor(app: App, files: TFile[], onSubmit: (selectedFiles: TFile[]) => void) {
super(app);
this.files = files.sort((a, b) => a.basename.localeCompare(b.basename));
this.onSubmit = onSubmit;
}
onOpen() {
const { contentEl } = this;
contentEl.createEl('h2', { text: 'Select Files to Merge' });
contentEl.createEl('p', { text: 'Choose two or more processor files from the list below.' });
let nextButton: ButtonComponent;
const checkboxGroup = contentEl.createDiv();
checkboxGroup.addClass('processor-file-selector-list');
this.files.forEach(file => {
const setting = new Setting(checkboxGroup)
.setName(file.basename)
.setDesc(file.path);
// This creates a CHECKBOX for multiple selections
setting.addToggle(toggle => {
toggle.onChange(value => {
if (value) {
this.selectedFilePaths.add(file.path);
} else {
this.selectedFilePaths.delete(file.path);
}
// This correctly enables the button when 2 or more are selected
nextButton.setDisabled(this.selectedFilePaths.size < 2);
});
});
});
new Setting(contentEl)
.addButton(btn => btn
.setButtonText('Cancel')
.onClick(() => this.close()))
.addButton(btn => {
nextButton = btn;
btn.setButtonText('Next: Select Survivor')
.setCta()
.setDisabled(true)
.onClick(() => {
const selectedFiles = this.files.filter(f => this.selectedFilePaths.has(f.path));
this.close();
this.onSubmit(selectedFiles);
});
});
}
onClose() {
this.contentEl.empty();
}
}
class PasteEnvModal extends Modal {
pastedText: string = '';
plugin: ProcessorProcessorPlugin;
constructor(app: App, plugin: ProcessorProcessorPlugin) {
super(app);
this.plugin = plugin;
}
onOpen() {
const { contentEl } = this;
contentEl.createEl('h2', { text: 'Complete Plugin Setup' });
contentEl.createEl('p', { text: 'Paste the entire block of environment variables from your RightBrain dashboard below. This will save your credentials and then automatically create the necessary AI tasks in your project.' });
const textArea = new TextAreaComponent(contentEl)
.setPlaceholder('RB_ORG_ID="..."\nRB_PROJECT_ID="..."')
.onChange(value => this.pastedText = value);
textArea.inputEl.rows = 12;
textArea.inputEl.style.width = '100%';
textArea.inputEl.style.fontFamily = 'monospace';
new Setting(contentEl)
.addButton(button =>
button.setButtonText('Begin Setup')
.setCta()
.onClick(() => {
if (this.pastedText.trim()) {
// This now triggers the entire setup flow
this.runFullSetup();
this.close();
} else {
new Notice("Text area is empty.");
}
}));
}
onClose() {
this.contentEl.empty();
}
/**
* Parses the pasted text, saves credentials, then proceeds to set up tasks.
*/
async runFullSetup() {
// --- Part 1: Parse and Save Credentials ---
const lines = this.pastedText.trim().split('\n');
const settingsToUpdate: Partial<ProcessorProcessorSettings> = {};
let credsFoundCount = 0;
const keyMap: { [key: string]: keyof ProcessorProcessorSettings } = {
'RB_ORG_ID': 'rightbrainOrgId',
'RB_PROJECT_ID': 'rightbrainProjectId',
'RB_CLIENT_ID': 'rightbrainClientId',
'RB_CLIENT_SECRET': 'rightbrainClientSecret'
};
for (const line of lines) {
const parts = line.split('=');
if (parts.length < 2) continue;
const key = parts[0].trim();
let value = parts.slice(1).join('=').trim().replace(/^["']|["']$/g, ''); // Strip only surrounding quotes
if (key in keyMap && value) {
const settingKey = keyMap[key];
(settingsToUpdate as any)[settingKey] = value;
credsFoundCount++;
}
}
if (credsFoundCount < 4) {
new Notice("Setup failed. Could not find all required credentials (ORG_ID, PROJECT_ID, CLIENT_ID, CLIENT_SECRET) in the pasted text.");
return;
}
this.plugin.settings = Object.assign(this.plugin.settings, settingsToUpdate);
await this.plugin.saveSettings();
new Notice(`Successfully updated ${credsFoundCount} credentials.`);
// --- Part 2: Call the Task Setup Logic ---
// We can now call the function directly, as the settings are saved.
// A small delay helps the user read the first notice.
await new Promise(resolve => setTimeout(resolve, 1000));
// Create the required RightBrain tasks in the configured project.
await this.plugin.setupRightBrainTasks();
}
}
// ----- SETTINGS TAB CLASS -----
class ProcessorProcessorSettingTab extends PluginSettingTab {
plugin: ProcessorProcessorPlugin;
constructor(app: App, plugin: ProcessorProcessorPlugin) {
super(app, plugin);
this.plugin = plugin;
}
display(): void {
const { containerEl } = this;
containerEl.empty();
containerEl.createEl('h2', { text: 'Processor Processor Settings' });
// --- API Keys & Credentials ---
containerEl.createEl('h3', { text: 'API Keys & Credentials' });
new Setting(containerEl)
.setName('SerpAPI Key')
.setDesc('Your SerpAPI Key for Google search functionality.')
.addText(text => text
.setPlaceholder('Enter your SerpAPI key')
.setValue(this.plugin.settings.serpApiKey)
.onChange(async (value) => {
this.plugin.settings.serpApiKey = value;
await this.plugin.saveSettings();
}));
// --- RightBrain Configuration ---
containerEl.createEl('h3', { text: 'RightBrain Task Configuration' });
new Setting(containerEl)
.setName('RB Extract Entities: Input Field Name')
.setDesc('The parameter name your RB Extract Entities task expects for the input text (e.g., "page_text", "document_content").')
.addText(text => text
.setValue(this.plugin.settings.rightbrainExtractInputField)
.setPlaceholder('e.g., page_text')
.onChange(async (value) => {
this.plugin.settings.rightbrainExtractInputField = value;
await this.plugin.saveSettings();
}));
new Setting(containerEl)
.setName('RB Extract Entities: Output Field (Third-Party)')
.setDesc('The field name in your RB Extract Entities task\'s JSON output for the list of third-party subprocessors (e.g., "third_party_subprocessors").')
.addText(text => text
.setValue(this.plugin.settings.rightbrainExtractOutputThirdPartyField)
.setPlaceholder('e.g., third_party_subprocessors')
.onChange(async (value) => {
this.plugin.settings.rightbrainExtractOutputThirdPartyField = value;
await this.plugin.saveSettings();
}));
new Setting(containerEl)
.setName('RB Extract Entities: Output Field (Own Entities)')
.setDesc('The field name in your RB Extract Entities task\'s JSON output for the list of own/affiliated entities (e.g., "own_entities").')
.addText(text => text
.setValue(this.plugin.settings.rightbrainExtractOutputOwnEntitiesField)
.setPlaceholder('e.g., own_entities')
.onChange(async (value) => {
this.plugin.settings.rightbrainExtractOutputOwnEntitiesField = value;
await this.plugin.saveSettings();
}));
// --- General Settings ---
containerEl.createEl('h3', { text: 'General Settings' });
new Setting(containerEl)
.setName('Create Pages for Own Entities')
.setDesc('If enabled, separate Markdown pages will also be created for "own entities" identified during processing, not just third-party subprocessors.')
.addToggle(toggle => toggle
.setValue(this.plugin.settings.createPagesForOwnEntities)
.onChange(async (value) => {
this.plugin.settings.createPagesForOwnEntities = value;
await this.plugin.saveSettings();
}));
new Setting(containerEl)
.setName('Verbose Debug Logging')
.setDesc('Enable detailed logging to the developer console for debugging purposes.')
.addToggle(toggle => toggle
.setValue(this.plugin.settings.verboseDebug)
.onChange(async (value) => {
this.plugin.settings.verboseDebug = value;
await this.plugin.saveSettings();
}));
// maxResultsPerProcessor is effectively fixed: discovery stops at the first verified subprocessor list,
// so the setting below is rendered read-only for transparency rather than as an editable control.
new Setting(containerEl)
.setName('Max Results Per Processor (Discovery)')
.setDesc('Maximum search results to process for each processor during initial discovery. Currently, the logic stops on the first verified list, effectively making this 1.')
.addText(text => text
.setValue(this.plugin.settings.maxResultsPerProcessor.toString())
.setDisabled(true)); // Read-only: discovery stops at the first verified list, so this value is informational
new Setting(containerEl)
.setName('Mapping Depth')
.setDesc('Set the maximum depth for the Map Subprocessor Relationships function (e.g., 2-5). Higher numbers will take much longer and use more API calls.')
.addText(text => text
.setPlaceholder('e.g., 3')
.setValue(this.plugin.settings.maxRecursiveDepth.toString())
.onChange(async (value) => {
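// Persist only positive integers; any other input leaves the stored depth unchanged.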
const num = parseInt(value);
if (!isNaN(num) && num > 0) {
this.plugin.settings.maxRecursiveDepth = num;
await this.plugin.saveSettings();
}
}));
new Setting(containerEl)
.setName('Discovery Cache Duration (Days)')
.setDesc('How many days to consider a processor\'s data "fresh". A processor with a "complete" status discovered within this period will be skipped during recursive runs.')
.addText(text => text
.setPlaceholder('e.g., 30')
.setValue(this.plugin.settings.discoveryCacheDays.toString())
.onChange(async (value) => {
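// Zero and positive integers are accepted; non-numeric input leaves the stored value unchanged.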
const num = parseInt(value);
if (!isNaN(num) && num >= 0) {
this.plugin.settings.discoveryCacheDays = num;
await this.plugin.saveSettings();
}
}));
new Setting(containerEl)
.setName('Processors Folder Path')
.setDesc('Path to the folder where processor and subprocessor notes will be stored (e.g., "Third Parties/Processors").')
.addText(text => text
.setPlaceholder('e.g., Processors')
.setValue(this.plugin.settings.processorsFolderPath)
.onChange(async (value) => {
this.plugin.settings.processorsFolderPath = value || DEFAULT_SETTINGS.processorsFolderPath;
await this.plugin.saveSettings();
}));
new Setting(containerEl)
.setName('Analysis Logs Folder Path')
.setDesc('Path to the folder where analysis log notes for each processor will be stored (e.g., "Compliance/Logs").')
.addText(text => text
.setPlaceholder('e.g., Analysis Logs')
.setValue(this.plugin.settings.analysisLogsFolderPath)
.onChange(async (value) => {
this.plugin.settings.analysisLogsFolderPath = value || DEFAULT_SETTINGS.analysisLogsFolderPath;
await this.plugin.saveSettings();
}));
}
}