Automating Price Quote Processing: Writing Supplier Prices to Excel
Japanese version:
Introduction
Previously, I introduced a Python script that sends orders and quote requests via email using an order form stored on our laboratory's network drive. You can find more information about this in my earlier post:
Today, I'll present a Python script that processes reply emails from suppliers containing price quotes. This script extracts pricing information and other relevant data, then writes it to an Excel file.
Script Overview
The script performs the following key functions:
Email Retrieval and Analysis:
Connects to a specified mailbox using IMAP.
Searches for emails with the subject "Quote Request" received within the last week.
Extracts the email body and sender information.
Data Extraction:
Utilizes the OpenAI GPT-4o-mini API to extract crucial information from the email body, including product name, company name, product code, list price, and quoted price.
Structures the extracted data for further processing.
Supplier Information Mapping:
Uses a predefined mapping of suppliers and their email addresses to identify the corresponding supplier for each email.
Excel File Output:
Converts the extracted data into a pandas DataFrame.
Appends new data to an existing Excel file, writing each item to specified columns.
Environment Setup
To set up the environment, install the necessary packages using conda:
conda install -c conda-forge pandas openpyxl openai python-dotenv
Script
import imaplib
import email
from email.header import decode_header
import os
import datetime
import pandas as pd
from openai import OpenAI
from openai import BadRequestError
from email.utils import parseaddr
from pathlib import Path
import re
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
# Get sensitive information from environment variables
username = os.getenv('EMAIL_USERNAME')
password = os.getenv('EMAIL_PASSWORD')
imap_url = os.getenv('IMAP_URL')
# Check the operating system
if os.name == 'nt': # For Windows
excel_file_path = Path(os.getenv('EXCEL_FILE_PATH_WINDOWS'))
elif os.name == 'posix': # For macOS
excel_file_path = Path(os.getenv('EXCEL_FILE_PATH_MAC'))
else:
raise EnvironmentError("Unsupported Operating System")
# OpenAI API configuration
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
# Mapping of agencies and their email addresses
agency_mapping = {
'Agency A': {'email1@example.com', 'email2@example.com'},
'Agency B': {'email3@example.com'},
'Agency C': {'email4@example.com'},
'Agency D': {'email5@example.com'},
'Agency E': {'email6@example.com', 'email7@example.com'},
}
def decode_mime_words(s):
return u''.join(
word.decode(encoding or 'utf-8') if isinstance(word, bytes) else word
for word, encoding in decode_header(s))
# Function to retrieve data from emails
def get_mail_data(username, password, imap_url, retries=50):
attempt = 0
while attempt < retries:
try:
mail = imaplib.IMAP4_SSL(imap_url, 993)
mail.login(username, password)
mail.select("INBOX")
# Calculate the date for one week ago
one_week_ago = (datetime.datetime.now() - datetime.timedelta(days=7)).strftime("%d-%b-%Y")
# Search and retrieve emails from the last week
status, messages = mail.search(None, f'SINCE "{one_week_ago}"')
messages = messages[0].split(b' ')
extracted_data = []
if not messages or messages == [b'']:
print("No matching emails found.")
mail.logout()
return extracted_data
for mail_id in messages:
res, msg = mail.fetch(mail_id, '(RFC822)')
for response in msg:
if isinstance(response, tuple):
msg = email.message_from_bytes(response[1])
subject = decode_mime_words(msg.get("Subject"))
if 'Quote Request' in subject:
from_address = decode_mime_words(msg.get("From"))
from_address = parseaddr(from_address)[1]
print("From address:", from_address)
body = ""
if msg.is_multipart():
for part in msg.walk():
if part.get_content_type() == "text/plain":
body = part.get_payload(decode=True)
charset = part.get_content_charset()
body = body.decode(charset, errors='replace') if charset else body.decode('utf-8', errors='replace')
else:
body = msg.get_payload(decode=True)
charset = msg.get_content_charset()
body = body.decode(charset, errors='replace') if charset else body.decode('utf-8', errors='replace')
body = body.replace("\r\n", " ")
agency = next((key for key, val in agency_mapping.items() if from_address in val), "Unknown")
extracted_data.append((body, agency))
mail.logout()
return extracted_data
except imaplib.IMAP4.error as e:
print(f"Login attempt {attempt + 1} failed: {e}")
attempt += 1
print("All login attempts failed.")
return []
# Function to extract data using OpenAI API
def extract_data_with_gpt(text_data, agency_mapping):
parsed_data = []
print("Input Data:", text_data)
for text, agency in text_data:
text = str(text)
text = text.encode('utf-8', errors='ignore').decode('utf-8')
text = re.sub(r'[^\x20-\x7Eぁ-んァ-ン一-龥ー]', ' ', text)
if len(text) > 5000:
print("Text too long, truncating")
text = text[:5000]
print("Sending text to API:", text[:500])
try:
chat_completion = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "Extract the product name, company, product code, list price, and delivery price from the following text. For list price and delivery price, include only the numerical price information. Finally, format the extracted information as a semicolon-separated format suitable for a Pandas DataFrame. The DataFrame structure should be 'Product Name;Company Name;Product Code;List Price;Delivery Price', with this header in the first row. Include only the semicolon-separated data in your response."},
{"role": "user", "content": text}
]
)
print("API Response:", chat_completion)
extracted_info = chat_completion.choices[0].message.content
print("Extracted info:", extracted_info)
if ';' in extracted_info:
data_rows = [item.split(';') for item in extracted_info.split("\n") if item.strip() and "Product Name" not in item]
else:
product_name = [line.split(': ')[1] for line in extracted_info.split("\n") if line.startswith('Product Name')]
company_name = [line.split(': ')[1] for line in extracted_info.split("\n") if line.startswith('Company Name')]
product_code = [line.split(': ')[1] for line in extracted_info.split("\n") if line.startswith('Product Code')]
list_price = [line.split(': ')[1] for line in extracted_info.split("\n") if line.startswith('List Price')]
delivery_price = [line.split(': ')[1] for line in extracted_info.split("\n") if line.startswith('Delivery Price')]
data_rows = list(zip(product_name, company_name, product_code, list_price, delivery_price))
print("Data rows before adding agency:", data_rows)
for row in data_rows:
row.append(agency)
print("Data rows after adding agency:", data_rows)
df = pd.DataFrame(data_rows, columns=["Product Name", "Company Name", "Product Code", "List Price", "Delivery Price", "Agency"])
parsed_data.append(df)
except BadRequestError as e:
print("Error with API request:", e)
return pd.concat(parsed_data, ignore_index=True) if parsed_data else pd.DataFrame()
def write_to_excel(df, excel_file_path):
with pd.ExcelWriter(excel_file_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
if 'Sheet1' in writer.book.sheetnames:
start_row = writer.book['Sheet1'].max_row
else:
start_row = 0
column_positions = {"Product Name": 0, "Company Name": 1, "Product Code": 2, "List Price": 4, "Delivery Price": 5, "Agency": 8}
for column in df.columns:
col_idx = column_positions.get(column, 0)
df[column].to_excel(writer, sheet_name='Sheet1', startcol=col_idx, startrow=start_row, index=False, header=False)
# Main process
if __name__ == "__main__":
mail_data = get_mail_data(username, password, imap_url)
df = extract_data_with_gpt(mail_data, agency_mapping)
print(df)
write_to_excel(df, excel_file_path)
.env
# Mail server settings
EMAIL_USERNAME=your_email@example.com
EMAIL_PASSWORD=your_email_password
IMAP_URL=imap.example.com
# OpenAI API
OPENAI_API_KEY=your_openai_api_key
# Excel file path
EXCEL_FILE_PATH_WINDOWS=C:\path\to\your\excel\file.xlsx
EXCEL_FILE_PATH_MAC=/path/to/your/excel/file.xlsx
Conclusion
This automation tool enables efficient databasing of quoted prices from various suppliers. After processing, emails are moved from the INBOX to a folder named "DONE" for organizational purposes.
The goal is to allocate more time for educational and research activities by streamlining administrative tasks. I hope this contributes to a more productive academic environment.
I have implemented various other automation processes, which I plan to introduce in future posts.
Your feedback, questions, and suggestions for improvement are welcome and appreciated. Please feel free to leave a comment below.