I have written an invoice processing crew, and one of its tools is designed to do one thing: use a given prompt to extract certain fields from PDF files using gpt-4o-mini. For some reason my workflow fails to identify the selected_pdfs. Here is part of the log information:
2025-02-03 22:47:29,496 - invoice_processing - INFO - Selected PDFs at initialization: ['~/90194464.pdf', '~/90234640.pdf', '~/90237804.pdf']
2025-02-03 22:47:29,503 - invoice_processing - DEBUG - Initializing DataExtractorTool.
2025-02-03 22:47:29,539 - invoice_processing - DEBUG - ChatOpenAI model initialized successfully.
2025-02-03 22:47:29,539 - invoice_processing - DEBUG - FileManagerTool initialized.
2025-02-03 22:47:38,999 - invoice_processing - INFO - Starting data extraction process.
2025-02-03 22:47:38,999 - invoice_processing - WARNING - No PDF files provided for extraction.
Here is the tool called data_extractor_tool.py:
Configure logging
logger = logging.getLogger(“invoice_processing”)
logger.setLevel(logging.DEBUG) # Set to DEBUG to capture all levels of logs
Create handlers
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.DEBUG)
file_handler = logging.FileHandler(“data_extractor_tool_log.txt”)
file_handler.setLevel(logging.DEBUG)
Create formatter and add it to handlers
formatter = logging.Formatter(‘%(asctime)s - %(levelname)s - %(message)s’)
console_handler.setFormatter(formatter)
file_handler.setFormatter(formatter)
Add handlers to the logger
logger.addHandler(console_handler)
logger.addHandler(file_handler)
class DataExtractorToolSchema(BaseModel):
“”“Input for the Data Extractor tool.”“”
selected_pdfs: List[str] = Field(…, description=“List of paths to PDF files or text files containing invoice data.”)
prompt: str = Field(…, description=“The prompt to use for data extraction.”)
class DataExtractorTool(BaseTool):
name: str = “Extract Invoice Data”
description: str = "Extracts structured data from invoice text using an LLM. "
“Input should be a list of paths to PDF files or text files containing invoice data and the extraction prompt.”
args_schema: Type[BaseModel] = DataExtractorToolSchema
model_name: str = “gpt-4o-mini” # Default model
openai_api_key: Optional[str] = os.getenv(“OPENAI_API_KEY”)
llm: Optional[ChatOpenAI] = None
file_manager_tool: FileManagerTool = None
def __init__(self, model: Optional[str] = None, file_manager_tool: Optional[FileManagerTool] = None):
"""Initializes the Data Extractor with an LLM."""
super().__init__()
logger.debug("Initializing DataExtractorTool.")
if model:
self.model_name = model
logger.debug(f"Model name set to: {self.model_name}")
try:
self.llm = ChatOpenAI(model_name=self.model_name, openai_api_key=self.openai_api_key, temperature=0.7)
logger.debug("ChatOpenAI model initialized successfully.")
except Exception as e:
logger.error(f"Failed to initialize ChatOpenAI model: {e}", exc_info=True)
raise
self.file_manager_tool = file_manager_tool or FileManagerTool()
logger.debug("FileManagerTool initialized.")
def _run(self, selected_pdfs: List[str], prompt: str) -> List[Dict]:
"""Executes the tool to extract invoice data using an LLM."""
logger.info("Starting data extraction process.")
extracted_data_list = []
logger.warning(f"Selected PDFs: {selected_pdfs}")
logger.warning(f"Prompt: {prompt}")
if not selected_pdfs:
logger.warning("No PDF files provided for extraction.")
return extracted_data_list
for file_path in selected_pdfs:
logger.debug(f"Processing file: {file_path}")
try:
if not os.path.exists(file_path):
logger.error(f"File does not exist: {file_path}")
continue
if file_path.endswith('.pdf'):
logger.debug(f"Reading PDF file: {file_path}")
invoice_text = self.file_manager_tool.read_pdf(file_path)
elif file_path.endswith('.txt'):
logger.debug(f"Reading text file: {file_path}")
with open(file_path, 'r', encoding='utf-8') as f:
invoice_text = f.read()
else:
logger.error(f"Unsupported file type: {file_path}")
continue
if not invoice_text:
logger.warning(f"No content extracted from file: {file_path}")
continue
logger.info(f"Extracted invoice text from {file_path}: {invoice_text[:200]}...")
# Combine the prompt and invoice text to form the full instruction for the LLM
full_prompt = f"{prompt}\n\n{invoice_text}"
logger.debug(f"Full prompt for LLM: {full_prompt[:500]}...") # Log first 500 chars
try:
response = self.llm.invoke(full_prompt)
response_content = response.content # Access the content attribute
logger.info(f"Received response from LLM for {file_path}: {response_content[:200]}...")
except Exception as e:
logger.error(f"LLM invocation failed for {file_path}: {e}", exc_info=True)
continue
# Attempt to extract JSON using regex
json_match = re.search(r"\{.*}", response_content, re.DOTALL)
if json_match:
json_string = json_match.group(0).strip()
logger.debug(f"Extracted JSON string: {json_string}")
try:
extracted_data = json.loads(json_string)
logger.info(f"Parsed JSON data for {file_path}: {extracted_data}")
extracted_data_list.append(extracted_data)
except json.JSONDecodeError as e:
logger.error(f"JSON decoding failed for {file_path}: {e}", exc_info=True)
else:
logger.error(f"No JSON object found in LLM response for {file_path}.")
except Exception as e:
logger.error(f"Unexpected error processing {file_path}: {e}", exc_info=True)
logger.info("Data extraction process completed.")
return extracted_data_list