← Back to Docs
TokenCut + LangChain
One callback handler to compress all prompts in your LangChain pipeline.
Installation
pip install agentready-sdk langchain-openai
Basic Usage
from langchain_openai import ChatOpenAI
from agentready.integrations.langchain import TokenCutCallbackHandler
# Create the handler (use your API key from env)
import os
handler = TokenCutCallbackHandler(api_key=os.environ["AGENTREADY_API_KEY"])
# Attach to any LangChain LLM
llm = ChatOpenAI(model="gpt-4", callbacks=[handler])
# All prompts are automatically compressed
response = llm.invoke("Your very long context here...")
# Check cumulative savings
print(handler.stats)
# {'total_tokens_saved': 3421, 'total_savings_usd': 0.1026}
With RAG Chain
from langchain.chains import RetrievalQA
handler = TokenCutCallbackHandler(api_key=os.environ["AGENTREADY_API_KEY"], level="aggressive")
qa_chain = RetrievalQA.from_chain_type(
llm=ChatOpenAI(model="gpt-4", callbacks=[handler]),
retriever=vectorstore.as_retriever(),
)
# RAG context is compressed before hitting GPT-4
answer = qa_chain.invoke("What is the system architecture?")
print(f"Saved {handler.stats['total_tokens_saved']} tokens")
Configuration
handler = TokenCutCallbackHandler(
api_key=os.environ["AGENTREADY_API_KEY"],
level="medium", # light, medium, aggressive
preserve_code=True, # keep code blocks intact
min_length=400, # skip short prompts (chars)
compress_system=False, # skip system messages
)
Why Use This?
- Zero code changes: Just add a callback — no need to modify your chains
- RAG optimization: Compress retrieved context before LLM processing
- Cost tracking: handler.stats gives cumulative savings
- Safe: Falls back to original text if compression fails