Hello,
I see a knowledge source for strings on the crewAI documentation page, but I am looking for help with using a PDF as a knowledge source. Also, is it possible to keep the PDF document at some cloud storage location so that the crew can access it as a knowledge source?
It seems the contents of the PDF are not in the my_pdf_source object. What other steps need to be carried out? Do we have to do something with the memory?
There is a message [2024-12-17 13:02:36][ERROR]: Failed to upsert documents: Expected metadata to be a non-empty dict, got 0 metadata attributes in upsert.
[2024-12-17 13:02:36][WARNING]: Failed to init knowledge: Expected metadata to be a non-empty dict, got 0 metadata attributes in upsert.
import fitz
def pdf_to_text(pdf_path, txt_path):
    """Extract the text of every page of a PDF into a UTF-8 text file.

    Args:
        pdf_path: Path of the PDF to read; passed to ``fitz.open``.
        txt_path: Path of the text file to write. It is opened in "w" mode,
            so an existing file at that path is overwritten.
    """
    # Open the PDF
    pdf_document = fitz.open(pdf_path)
    try:
        # Create a text file to store the extracted text
        with open(txt_path, "w", encoding="utf-8") as text_file:
            # Pages are written in document order, one after another.
            for page in pdf_document:
                text_file.write(page.get_text())
    finally:
        # Close the PDF even when extraction or writing fails
        pdf_document.close()
# Example usage: extract filename.pdf into a .txt file in the same folder
txt_path = "/home/user/crewai/demo/knowledge/filename.txt"
pdf_path = "/home/user/crewai/demo/knowledge/filename.pdf"
pdf_to_text(pdf_path=pdf_path, txt_path=txt_path)
And here is my crew.py
from crewai import Agent, Crew, Process, Task, LLM
from crewai.project import CrewBase, agent, crew, task, before_kickoff, after_kickoff
# Knowledge temporary fix
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from pydantic import Field
from typing import Dict
import uuid
class LocalTxTFileKnowledgeSource(BaseKnowledgeSource):
    """Knowledge source that reads its content from one local UTF-8 text file."""

    file_path: str = Field(description="Path to the local .txt file")

    def load_content(self) -> Dict[str, str]:
        """Read the file and return its text keyed by the file path.

        Returns:
            A single-entry dict mapping ``file_path`` to the file's full text.

        Raises:
            ValueError: If the file cannot be opened, read or decoded.
        """
        try:
            with open(self.file_path, "r", encoding="utf-8") as file:
                content = file.read()
        except (OSError, UnicodeError) as e:
            # Chain the cause so the underlying I/O error stays in the traceback.
            raise ValueError(f"Failed to read the file {self.file_path}: {str(e)}") from e
        return {self.file_path: content}

    def add(self) -> None:
        """Process and store the file content."""
        # Collect the chunks produced by this call separately so the metadata
        # below describes exactly the chunks that were just added.
        new_chunks = []
        for text in self.load_content().values():
            new_chunks.extend(self._chunk_text(text))
        self.chunks.extend(new_chunks)

        # One metadata dict per new chunk; descriptions are numbered from 1.
        chunks_metadata = [
            {
                "chunk_id": str(uuid.uuid4()),
                "source": self.file_path,
                "description": f"Chunk {position} from file {self.file_path}",
            }
            for position, _ in enumerate(new_chunks, start=1)
        ]
        self.save_documents(metadata=chunks_metadata)
@CrewBase
class Demo2:
    """Demo2 crew"""

    # Locations of the agent and task configuration files
    agents_config = 'config/agents.yaml'
    tasks_config = 'config/tasks.yaml'

    @agent
    def reviewer(self) -> Agent:
        """Build the reviewer agent from the 'reviewer' entry of the agents config."""
        reviewer_config = self.agents_config['reviewer']
        return Agent(
            config=reviewer_config,
            memory=True,
            verbose=True,
            max_rpm=10,  # Limit API calls
        )

    @task
    def documentation_review_task(self) -> Task:
        """Build the documentation review task; its output goes to a markdown file."""
        review_config = self.tasks_config['documentation_review_task']
        return Task(
            config=review_config,
            output_file='outputs/1_documentation_review_task.md',
        )

    @crew
    def crew(self) -> Crew:
        """Creates the Demo2 crew"""
        # The text file is the crew's only knowledge source.
        knowledge = [
            LocalTxTFileKnowledgeSource(
                file_path="knowledge/filename.txt",
                metadata={"version": "15.1"},
            )
        ]
        return Crew(
            agents=self.agents,  # Automatically created by the @agent decorator
            tasks=self.tasks,  # Automatically created by the @task decorator
            process=Process.sequential,
            verbose=True,
            knowledge_sources=knowledge,
            full_output=True,
            output_log_file='outputs/0_crew_output_log_file.md',
        )
Hi, as a follow-up on this question, is there a way to set the knowledge search as a directory of PDFs for RAG, instead of just a single PDF as in this example?
Knowledge in CrewAI is a powerful system that allows AI agents to access and utilize external information sources during their tasks. Think of it as giving your agents a reference library they can consult while working.
If I use the “LocalTxTFileKnowledgeSource” or “PDFKnowledgeSource”, I get this error:
[2024-12-20 16:05:31][ERROR]: Failed to upsert documents: timed out in upsert.
[2024-12-20 16:05:31][WARNING]: Failed to init knowledge: timed out in upsert.
Here’s how to implement PDF as a knowledge source.
Make sure you have a folder named knowledge in the root of your directory where you should save your PDF(s).
# Imports
from crewai import Agent, Task, Crew, Process, LLM
from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource
# Pass the PDF to the knowledge class
# IMPORTANT: the file path should be the name of the PDF only, not a path like `knowledge/pdf_name.pdf`
pdf_source = PDFKnowledgeSource(file_path="pdf_name.pdf")
...
# Hand the source created above to the crew; the name must match the variable
# assigned above, otherwise Python raises a NameError.
my_crew = Crew(
    ...,
    knowledge_sources=[pdf_source],
)
If you get the metadata error (this will be resolved when we cut v0.86.1), add a dummy input to the metadata like so: