Spaces:

BillyZ1129
/

Billy_Space

Sleeping

App Files Files Community

BillyZ1129 committed on Apr 17

Commit

5b64068

verified ·

1 Parent(s): 94dcd23

Update app.py

Browse files

Files changed (1) hide show

app.py +417 -23

app.py CHANGED Viewed

@@ -1,30 +1,424 @@
-import streamlit as st
-from transformers import pipeline
-# Load the text classification model pipeline
-classifier = pipeline("text-classification",model='isom5240ust/bert-base-uncased-emotion', return_all_scores=True)
-# Streamlit application title
-st.title("Text Classification for you")
-st.write("Classification for 6 emotions: sadness, joy, love, anger, fear, surprise")
-# Text input for user to enter the text to classify
-text = st.text_area("Enter the text to classify", "")
-# Perform text classification when the user clicks the "Classify" button
-if st.button("Classify"):
-    # Perform text classification on the input text
-    results = classifier(text)[0]
-    # Display the classification result
-    max_score = float('-inf')
-    max_label = ''
-    for result in results:
-        if result['score'] > max_score:
-            max_score = result['score']
-            max_label = result['label']
-    st.write("Text:", text)
-    st.write("Label:", max_label)
-    st.write("Score:", max_score)

+from flask import Flask, render_template, request, jsonify
+from werkzeug.utils import secure_filename
+from openai import OpenAI
+from io import BytesIO
+import PyPDF2
+from pdfminer.high_level import extract_text
+from docx import Document
+import os
+import re
+import uuid
+from typing import Tuple
+import pdfplumber
+app = Flask(__name__)
+# The upload directory can be overridden through the UPLOAD_FOLDER environment
+# variable; the default is the path the application originally used.
+app.config['UPLOAD_FOLDER'] = os.environ.get(
+    'UPLOAD_FOLDER', '/home/billy1129/resume_optimizer/static/uploads'
+)
+os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16 MB upload limit
+# The OpenAI SDK client is pointed at the DeepSeek endpoint via base_url.
+# The API key is read from the environment so that no credential is stored in
+# source control; a missing variable fails fast with a KeyError naming it.
+# NOTE(review): the key that was previously committed here should be revoked.
+client = OpenAI(
+    base_url="https://api.deepseek.com",
+    api_key=os.environ["DEEPSEEK_API_KEY"],
+)
+def safe_filename(filename: str) -> str:
+    """Return a sanitized, collision-resistant version of an uploaded file name.
+
+    Characters other than word characters (which include Chinese characters),
+    hyphens and dots are removed, and a six-character random hex suffix is
+    inserted between the stem and the extension.
+
+    Args:
+        filename: The client-supplied file name.
+
+    Returns:
+        ``"<stem>_<6 hex chars>[.<ext>]"``. When nothing usable remains of the
+        stem (for example ``"???.pdf"``), the stem ``"file"`` is used so the
+        result never starts with a dot and never loses its extension.
+    """
+    cleaned = re.sub(r'[^\w\u4e00-\u9fff\-\.]', '', filename.strip())
+    # Split on the last dot by hand: os.path.splitext treats a name such as
+    # ".pdf" as a stem without extension, which would hide the real extension.
+    stem, dot, ext = cleaned.rpartition('.')
+    if not dot:
+        stem, ext = cleaned, ''
+    # Leading/trailing dots in the stem would create hidden or odd-looking names.
+    stem = stem.strip('.') or 'file'
+    random_str = uuid.uuid4().hex[:6]
+    suffix = f".{ext}" if ext else ''
+    return f"{stem}_{random_str}{suffix}"
+def extract_text_from_pdf(file_stream: BytesIO) -> str:
+    """Extract text from a PDF, trying pdfplumber first and PyPDF2 as a fallback.
+
+    Both paths run the extracted text through ``process_duplicates`` to collapse
+    runs of a repeated character (presumably artefacts of how some PDFs render
+    bold text -- TODO confirm with the author).
+
+    Args:
+        file_stream: In-memory PDF content; the stream is rewound before each attempt.
+
+    Returns:
+        The extracted text with surrounding whitespace removed.
+
+    Raises:
+        ValueError: If the fallback extractor also fails.
+    """
+    def process_duplicates(text: str) -> str:
+        """Collapse runs of an identical Chinese character or punctuation mark."""
+        # NOTE(review): this also shortens legitimately doubled characters
+        # (any Han character repeated back-to-back is reduced to one).
+        # Repeated Chinese characters
+        text = re.sub(r'([\u4e00-\u9fff])\1+', r'\1', text)
+        # Repeated common punctuation (full-width and half-width)
+        text = re.sub(r'([，。、；："“”‘’\'\"\(\)\[\]\{\}\<\>])\1+', r'\1', text)
+        # Repeated full-width parentheses, e.g. "（（" becomes "（"
+        text = re.sub(r'(（)\1+', r'\1', text)
+        text = re.sub(r'(）)\1+', r'\1', text)
+        return text
+    try:
+        # Preferred extractor: pdfplumber
+        try:
+            file_stream.seek(0)
+            with pdfplumber.open(file_stream) as pdf:
+                # Extract every page exactly once, with the same tolerances that
+                # are used for the kept text, then drop pages that yielded nothing.
+                page_texts = [
+                    page.extract_text(x_tolerance=2, y_tolerance=2)
+                    for page in pdf.pages
+                ]
+            text = "\n".join(page_text for page_text in page_texts if page_text)
+            print("========= pdfplumber 原始提取内容 =========")
+            print(text)
+            text = process_duplicates(text)
+            print("========= 去重处理后内容 =========")
+            print(text)
+            return text.strip()
+        except Exception as e:
+            print(f"pdfplumber提取失败，尝试PyPDF2: {str(e)}")
+            # Fallback extractor: PyPDF2 with line-level de-duplication
+            file_stream.seek(0)
+            reader = PyPDF2.PdfReader(file_stream)
+            stripped_lines = (
+                line.strip()
+                for page in reader.pages
+                for line in (page.extract_text() or "").split('\n')
+            )
+            # dict.fromkeys drops repeated lines while keeping first-seen order;
+            # a set would shuffle the lines of the document.
+            text = '\n'.join(dict.fromkeys(line for line in stripped_lines if line))
+            print("========= PyPDF2 原始提取内容 =========")
+            print(text)
+            text = process_duplicates(text)
+            print("========= 去重处理后内容 =========")
+            print(text)
+            return text.strip()
+    except Exception as e:
+        raise ValueError(f"PDF解析失败: {str(e)}") from e
+def extract_text_from_word(file_stream: BytesIO) -> str:
+    """Extract text from a .docx document, including text placed inside tables.
+
+    Args:
+        file_stream: In-memory document content; the stream is rewound first.
+
+    Returns:
+        Non-empty body paragraphs followed by non-empty table cell texts,
+        joined with newlines.
+
+    Raises:
+        ValueError: If the document cannot be opened or read.
+    """
+    try:
+        file_stream.seek(0)
+        doc = Document(file_stream)
+        chunks = [para.text for para in doc.paragraphs if para.text]
+        # doc.paragraphs does not cover table content, so a table-based layout
+        # would otherwise come back empty.
+        for table in doc.tables:
+            for row in table.rows:
+                previous = None
+                for cell in row.cells:
+                    cell_text = cell.text.strip()
+                    # A horizontally merged cell is returned once per grid
+                    # column; skip the immediate repeats.
+                    if cell_text and cell_text != previous:
+                        chunks.append(cell_text)
+                    previous = cell_text
+        return "\n".join(chunks)
+    except Exception as e:
+        raise ValueError(f"Word解析失败: {str(e)}") from e
+def extract_text_from_file(file_stream: BytesIO, filename: str) -> str:
+    """Extract text from an uploaded file, dispatching on its extension.
+
+    Args:
+        file_stream: In-memory file content.
+        filename: Original file name; only its (case-insensitive) extension is used.
+
+    Returns:
+        The extracted text.
+
+    Raises:
+        ValueError: If the extension is unsupported or the content cannot be parsed.
+    """
+    lowered = filename.lower()
+    if lowered.endswith('.pdf'):
+        return extract_text_from_pdf(file_stream)
+    if lowered.endswith('.docx'):
+        return extract_text_from_word(file_stream)
+    if lowered.endswith('.doc'):
+        # The Word extractor only understands the .docx container. Still try it
+        # (the file may be a mislabelled .docx), but replace the parser error
+        # with an actionable message when it fails.
+        try:
+            return extract_text_from_word(file_stream)
+        except ValueError as e:
+            raise ValueError("不支持旧版 .doc 格式，请另存为 .docx 后重试") from e
+    if lowered.endswith('.txt'):
+        file_stream.seek(0)
+        raw = file_stream.read()
+        # Strict UTF-8 first (utf-8-sig also strips a BOM), then GB18030 for
+        # text saved in a Chinese legacy encoding.
+        for encoding in ('utf-8-sig', 'gb18030'):
+            try:
+                return raw.decode(encoding)
+            except UnicodeDecodeError:
+                continue
+        # Last resort: the original lenient behaviour.
+        return raw.decode('utf-8', errors='ignore')
+    raise ValueError("不支持的文件格式")
+def _parse_analysis_response(content: str) -> Tuple[list, int]:
+    """Parse the model's reply into (suggestion lines, score out of 100)."""
+    lines = [line.strip() for line in content.split('\n') if line.strip()]
+
+    def header(title: str) -> str:
+        # Section titles may arrive bare, in [..] or in 【..】 brackets.
+        return rf'^[\[【]?{title}[\]】]?'
+
+    # Prefer the explicit "总扣分: XX分" declaration.
+    deduction_points = None
+    for line in lines:
+        total_match = re.search(r'总扣分[:：]\s*(\d+)\s*分', line)
+        if total_match:
+            deduction_points = int(total_match.group(1))
+            break
+    # Otherwise add up the per-item deductions listed after the cons header.
+    if deduction_points is None:
+        deduction_points = 0
+        in_cons_section = False
+        for line in lines:
+            if re.match(header('缺点'), line):
+                in_cons_section = True
+                continue
+            if in_cons_section:
+                # Accept half- and full-width parentheses and minus signs.
+                item_match = re.search(r'[（(]\s*[-－−]\s*(\d+)\s*分\s*[)）]', line)
+                if item_match:
+                    deduction_points += int(item_match.group(1))
+    # Clamp so the score always lands in 0..100.
+    deduction_points = max(0, min(100, deduction_points))
+    score = 100 - deduction_points
+
+    # Keep the summary / pros / cons sections (header line included); the
+    # deduction-item section and anything before the first header are dropped.
+    kept_sections = (
+        ("summary", header('整体总结')),
+        ("pros", header('优点')),
+        ("cons", header('缺点')),
+    )
+    suggestions = []
+    current_section = None
+    for line in lines:
+        matched = next((name for name, pattern in kept_sections if re.match(pattern, line)), None)
+        if matched:
+            current_section = matched
+            suggestions.append(line)
+            continue
+        if re.match(header('扣分项'), line):
+            current_section = "deduction"
+            continue
+        if current_section in ("summary", "pros", "cons"):
+            suggestions.append(line)
+    return suggestions, score
+def analyze_resume_with_ai(text: str, job_position: str = None) -> Tuple[list, int]:
+    """Ask the chat model to review a résumé and derive a 0-100 score.
+
+    Args:
+        text: Résumé text; only the first 30000 characters are sent to the model.
+        job_position: Optional target position; when given, a position-specific
+            section is added to the prompt.
+
+    Returns:
+        ``(suggestions, score)`` where ``suggestions`` is the list of kept reply
+        lines and ``score`` is ``100 - total deduction``. When the text is too
+        long or the API call fails, a single explanatory message and a score
+        of 0 are returned instead of raising.
+    """
+    MAX_TOKENS = 120000
+    if len(text) > MAX_TOKENS * 3.5:
+        return ["简历内容过长，请简化内容"], 0
+    # Build the position-specific part of the prompt
+    job_specific_prompt = ""
+    if job_position:
+        job_specific_prompt = f"""
+[岗位针对性分析]
+目标岗位: {job_position}
+请特别关注以下与目标岗位相关的评估维度:
+1. 专业技能匹配度: 检查简历中是否包含该岗位的核心技能关键词
+2. 项目经验相关性: 评估项目经验与目标岗位的匹配程度
+3. 行业术语使用: 检查是否使用了该岗位领域的专业术语
+4. 成就量化标准: 根据该岗位特点评估成就描述的量化程度
+"""
+    prompt = f"""请严格按照以下四部分分析简历,严格遵循格式:
+{job_specific_prompt}
+[总扣分]
+总扣分: XX分  # 必须单独一行明确写出总扣分值
+[扣分项]
+请列出简历的所有扣分项，每一项必须明确指出扣分项在简历中的位置，扣分数量，并给出具体改进建议，将扣分项和建议放在【缺点】中输出给用户。
+请严格遵循以下评分标准中的扣分规则，最后在第一行，计算总扣分量，格式为"总扣分: XX分"。
+[整体总结]
+用一段话来整体概括这篇简历的优缺点，特别关注与目标岗位的匹配度。
+[优点]
+• 优点1 (特别标注与目标岗位相关的优势)
+• 优点2
+[缺点]
+• 具体位置(简历第几行或哪个部分): 具体问题 (具体改进建议) (-X分)
+• 具体位置(简历第几行或哪个部分): 具体问题 (具体改进建议) (-X分)
+确保在[缺点]部分之后不输出任何其他内容
+评分标准：
+高质量简历评分标准（基于STAR法则和岗位匹配度）
+一、基础信息完整性（满分15分）
+必备信息要求：
+姓名
+联系方式（电话、邮箱）
+住址信息（至少提供省份或城市）
+评分细则：
+每项必备信息均完整且正确：得满分15分。
+缺失任一项：扣5分；如缺失两项及以上，累计扣分，但最低分为0分。
+二、内容结构与逻辑性（满分25分）
+结构要求：
+简历需清晰划分区域，如个人信息、教育经历、工作经历、技能、项目经验等。
+每一区块内容需符合逻辑，信息层次分明。
+评分细则：
+每个必备区域（至少5个区域）均明确标识并合理排序：每个区域得5分，区域缺失或模糊者扣5分。
+在每个区域内，要求描述具备逻辑性和条理性，出现明显逻辑混乱（如叙述前后矛盾或顺序混乱）者，每处扣2分，累计扣分不超过该区域分值。
+三、专业技能及关键词匹配（满分30分）
+匹配要求：
+简历中必须明确列出与目标职位直接相关的核心技能或关键词（建议不少于3项，最多5项计分）。
+评分细则：
+每列出一项与目标职位高度匹配的技能或关键词，得6分（最多计5项得分）。
+如未列出任何相关技能或关键词，直接扣30分。
+若关键词存在但与目标职位匹配度较低或描述不清晰，依据实际情况酌情扣分（每项扣分范围为2-4分）。
+【新增】岗位相关关键词匹配度额外评分：
+• 完全匹配目标岗位核心技能：每项+2分（最高+10分）
+• 部分匹配目标岗位次要技能：每项+1分（最高+5分）
+四、工作成就与项目描述（满分20分，必须遵循STAR法则）
+要求说明：
+每段工作经历或项目描述必须完整包含：
+Situation（情境）： 说明工作/项目背景与挑战。
+Task（任务）： 说明你在该情境下需要完成的任务。
+Action（行动）： 描述为解决问题所采取的具体措施。
+Result（结果）： 列出取得的成果和影响（最好附量化指标）。
+评分细则：
+每完整描述一项工作或项目经历且具备STAR所有要素：得5分，最多计4项得分。
+若工作或项目描述存在缺失或不清晰（例如缺少关键STAR元素），则每项扣2-5分（依据缺失程度和信息模糊程度）。
+如果简历完全没有相关描述，直接扣20分。
+【新增】岗位相关项目经验额外评分：
+• 高度相关项目：每项+3分（最高+9分）
+• 部分相关项目：每项+1分（最高+3分）
+五、语言表达及排版质量（满分10分）
+表达与排版要求：
+整体语言表达准确、专业，无明显错别字或语法错误。
+排版整洁、格式统一，避免混乱或信息堆砌。
+评分细则：
+排版格式符合要求，得5分；若出现明显格式错误或杂乱，每项错误扣1至5分，累计扣分最高5分。
+语言表达无错别字或语法错误，得5分；每出现一处错别字或语法错误扣1分（最多扣5分）。
+简历内容：
+{text[:30000]}{'...' if len(text) > 30000 else ''}"""
+    try:
+        # NOTE(review): temperature=1 makes scores vary between runs for the
+        # same résumé; consider lowering it if reproducible scoring matters.
+        response = client.chat.completions.create(
+            model="deepseek-chat",
+            messages=[
+                {"role": "system", "content": "你是一位严格的简历评估专家。你必须严格按照评分标准进行评分和扣分，并明确指出每个缺点在简历中的具体位置。总分为100分，最终分数 = 100 - 总扣分。"},
+                {"role": "user", "content": prompt}
+            ],
+            temperature=1,
+            stream=False
+        )
+        content = response.choices[0].message.content
+        # Dump the raw reply to ease debugging of the parser
+        print("======== RAW AI RESPONSE ========")
+        print(content)
+        print("=================================")
+        return _parse_analysis_response(content)
+    except Exception as e:
+        print(f"AI分析错误: {str(e)}")
+        return [f"AI分析时发生错误: {str(e)}"], 0
+@app.route('/')
+def index():
+    """Serve the landing page by rendering the ``index.html`` template."""
+    return render_template('index.html')
+@app.route('/upload', methods=['POST'])
+def upload_file():
+    """Accept a résumé upload, extract its text, score it and reply with JSON.
+
+    Expects a multipart form with a ``resume`` file and a ``job_position``
+    field. Validation and parsing problems are answered with HTTP 400, any
+    other failure with HTTP 500.
+    """
+    uploads = request.files
+    if 'resume' not in uploads:
+        return jsonify({'error': '请选择文件上传'}), 400
+    resume = uploads['resume']
+    if resume.filename == '':
+        return jsonify({'error': '未选择文件'}), 400
+    try:
+        # The target position arrives as a regular form field
+        target_job = request.form.get('job_position')
+        if not target_job:
+            return jsonify({'error': '请选择目标岗位'}), 400
+        stored_name = safe_filename(resume.filename)
+        # Text extraction dispatches on the original (unsanitized) file name
+        resume_text = extract_text_from_file(BytesIO(resume.read()), resume.filename)
+        if not resume_text.strip():
+            return jsonify({'error': '文件内容为空或无法解析'}), 400
+        suggestions, score = analyze_resume_with_ai(resume_text, target_job)
+        payload = {
+            'message': '分析成功',
+            'suggestions': suggestions,
+            'score': score,
+            'filename': stored_name,
+            'job_position': target_job
+        }
+        return jsonify(payload)
+    except ValueError as err:
+        return jsonify({'error': str(err)}), 400
+    except Exception as err:
+        print(f"上传处理错误: {str(err)}")
+        return jsonify({'error': f'处理失败: {str(err)}'}), 500
+@app.route('/generate_cover_letter', methods=['POST'])
+def generate_cover_letter():
+    """Generate a cover letter from a JSON request describing résumé and job.
+
+    Required JSON fields: ``company_name``, ``position``, ``resume_text``,
+    ``job_description``. Optional: ``company_info``, ``motivation``.
+
+    Returns:
+        JSON with ``success``, ``cover_letter`` and ``word_count`` (number of
+        non-whitespace characters), HTTP 400 for a missing/invalid body or
+        field, or HTTP 500 if generation fails.
+    """
+    try:
+        # silent=True yields None instead of raising on a non-JSON or malformed
+        # body, so a client mistake is answered with 400 rather than 500.
+        data = request.get_json(silent=True)
+        if not isinstance(data, dict):
+            return jsonify({'error': '请求体必须是JSON对象'}), 400
+        required_fields = ['company_name', 'position', 'resume_text', 'job_description']
+        # Validate required fields
+        for field in required_fields:
+            if not data.get(field):
+                return jsonify({'error': f'缺少必填字段: {field}'}), 400
+        # Build the prompt
+        prompt = f"""你是一位专业的职业顾问，需要根据申请人提供的简历内容撰写求职信。请严格遵守以下规则：
+1. 信息真实性原则：
+- 只能使用简历中明确列出的教育背景、工作经历、项目经验和技能
+- 绝对禁止添加、编造或推断简历中没有的信息
+- 如果某项要求(如特定技能)在简历中未体现，不要在求职信中提及
+2. 内容要求：
+[必须包含的格式要素]
+- 正式商务信函格式(日期、称呼、正文、结尾敬语)
+- 称呼使用"尊敬的招聘经理"（如不知道具体姓名）
+- 结尾要有明确的行动号召(如期待面试机会)
+[内容结构]
+第一段：明确申请职位和动机(30-50字)
+第二段：从简历中提取与职位最相关的2-3个核心优势(80-120字)
+第三段：结合公司文化和职位要求的具体匹配点(80-120字)
+第四段：礼貌结尾和行动号召(30-50字)
+3. 写作规范：
+- 语言简洁专业，总字数严格控制在300-400字
+- 使用主动语态和积极措辞
+- 量化成果时只能使用简历中提供的数据
+- 避免使用夸张或主观的描述词
+4. 特别注意：
+- 如果简历中没有公司要求的关键技能或经验，不要在信中编造
+- 不要假设任何简历中没有的工作职责或成就
+- 不要添加简历中未列出的证书、奖项或培训经历
+[申请人简历内容]
+{data['resume_text'][:10000]}
+[目标公司信息]
+公司名称: {data['company_name']}
+公司介绍: {data.get('company_info', '未提供')}
+[申请职位]
+{data['position']}
+[职位描述及要求]
+{data['job_description']}
+[申请动机]
+{data.get('motivation', '未提供')}
+请现在开始撰写求职信，严格遵循以上所有要求。"""
+        # Ask the model for the cover letter
+        response = client.chat.completions.create(
+            model="deepseek-chat",
+            messages=[
+                {
+                    "role": "system",
+                    "content": """你是一位严谨的职业顾问，专门帮助求职者撰写基于事实的求职信。
+                    你必须：
+                    1. 只使用申请人简历中明确提供的信息
+                    2. 绝不添加、推断或编造任何简历中没有的内容
+                    3. 如果简历缺少职位要求的关键资质，如实呈现而不虚构
+                    4. 所有成就描述必须有简历中的具体数据支持"""
+                },
+                {"role": "user", "content": prompt}
+            ],
+            temperature=0.5,  # lower creativity in favour of accuracy
+            max_tokens=2000
+        )
+        content = response.choices[0].message.content
+        # Post-check: reject replies containing phrases that signal the model
+        # stepped outside the résumé.
+        if "简历中未提及" in content or "根据我的了解" in content:
+            raise ValueError("AI尝试添加简历外信息")
+        return jsonify({
+            'success': True,
+            'cover_letter': content,
+            # Chinese text has no spaces between words, so whitespace splitting
+            # does not measure length; count non-whitespace characters instead.
+            'word_count': len(re.sub(r'\s+', '', content))
+        })
+    except Exception as e:
+        print(f"推荐信生成错误: {str(e)}")
+        return jsonify({'error': f'生成失败: {str(e)}'}), 500
+if __name__ == '__main__':
+    # Debug mode enables Werkzeug's interactive debugger, which lets anyone who
+    # can reach the server execute code, so it is opt-in via FLASK_DEBUG=1
+    # instead of always on.
+    app.run(debug=os.environ.get('FLASK_DEBUG') == '1')