Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	修复newbing引用样式
Browse files- docs/test_markdown_format.py +130 -0
- request_llm/bridge_newbing.py +2 -2
- toolbox.py +7 -1
    	
        docs/test_markdown_format.py
    ADDED
    
    | @@ -0,0 +1,130 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            sample = """
         | 
| 2 | 
            +
            [1]: https://baike.baidu.com/item/%E8%B4%A8%E8%83%BD%E6%96%B9%E7%A8%8B/1884527 "质能方程(质能方程式)_百度百科"
         | 
| 3 | 
            +
            [2]: https://www.zhihu.com/question/348249281 "如何理解质能方程 E=mc²? - 知乎"
         | 
| 4 | 
            +
            [3]: https://zhuanlan.zhihu.com/p/32597385 "质能方程的推导与理解 - 知乎 - 知乎专栏"
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            你好,这是必应。质能方程是描述质量与能量之间的当量关系的方程[^1^][1]。用tex格式,质能方程可以写成$$E=mc^2$$,其中$E$是能量,$m$是质量,$c$是光速[^2^][2] [^3^][3]。
         | 
| 7 | 
            +
            """
         | 
| 8 | 
            +
            import re
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            def preprocess_newbing_out(s):
                """
                Rewrite NewBing citation markers and append an HTML footnote section.

                Every ``^N^`` marker in *s* is replaced by ``\\[N\\]``. If the result
                still contains the literal text ``[1]``, every line that starts with
                ``[`` is collected, its ``[N]`` labels are rewritten to ``\\[N\\]`` as
                well, and the collected lines are appended after a dotted rule inside
                a ``<small>`` element.

                Args:
                    s (str): raw text returned by NewBing.

                Returns:
                    str: the text with rewritten markers and, when applicable, the
                    appended footnote HTML.
                """
                caret_marker = r'\^(\d+)\^'    # matches ^number^
                bracket_marker = r'\[(\d+)\]'  # matches [number]

                def escaped_label(m):
                    # Raw strings: '\[' in a normal literal is an invalid escape sequence.
                    return r'\[' + m.group(1) + r'\]'

                result = re.sub(caret_marker, escaped_label, s)
                if '[1]' in result:
                    footnotes = [
                        re.sub(bracket_marker, escaped_label, line)
                        for line in result.split('\n')
                        if line.startswith('[')
                    ]
                    result += '<br/><hr style="border-top: dotted 1px #44ac5c;"><br/><small>' + "<br/>".join(footnotes) + '</small>'
                return result
         | 
| 18 | 
            +
             | 
| 19 | 
            +
             | 
| 20 | 
            +
            def close_up_code_segment_during_stream(gpt_reply):
                """
                Close a fenced code block that is still being streamed.

                While the model is part-way through a code block (the opening ``` has
                arrived but the closing ``` has not), append the missing closing fence.

                Args:
                    gpt_reply (str): the reply text received so far.

                Returns:
                    str: *gpt_reply* unchanged when it ends with a fence or contains an
                    even number of fences (including none); otherwise *gpt_reply*
                    followed by a newline and a closing fence.
                """
                fence = '```'
                # A reply that currently ends on a fence is passed through untouched.
                if gpt_reply.endswith(fence):
                    return gpt_reply
                # An odd number of fences means the last code block is still open.
                block_is_open = gpt_reply.count(fence) % 2 == 1
                return gpt_reply + '\n```' if block_is_open else gpt_reply
         | 
| 44 | 
            +
                
         | 
| 45 | 
            +
            import markdown
         | 
| 46 | 
            +
            from latex2mathml.converter import convert as tex2mathml
         | 
| 47 | 
            +
            from functools import wraps, lru_cache
         | 
| 48 | 
            +
            def markdown_convertion(txt):
                """
                Convert Markdown text to HTML wrapped in a ``markdown-body`` div.

                Text that already starts with the wrapper's opening tag and ends with
                its closing tag is returned unchanged. When the text contains ``$`` and
                no code fence, the math is emitted twice: first as colourised raw TeX
                (easy to copy), then converted with ``tex2mathml``; the two versions are
                joined by a rendered ``---`` rule. Otherwise the text is converted with
                the code-oriented extensions only.

                Args:
                    txt (str): Markdown source, or HTML previously produced by this
                        function.

                Returns:
                    str: the HTML fragment.
                """
                pre = '<div class="markdown-body">'
                suf = '</div>'
                if txt.startswith(pre) and txt.endswith(suf):
                    # Already converted; converting a second time could corrupt it.
                    return txt

                markdown_extension_configs = {
                    'mdx_math': {
                        'enable_dollar_delimiter': True,
                        'use_gitlab_delimiters': False,
                    },
                }
                find_equation_pattern = r'<script type="math/tex(?:.*?)>(.*?)</script>'

                def tex2mathml_catch_exception(content, *args, **kwargs):
                    # Best effort: when conversion fails, keep the raw TeX. Only
                    # ordinary exceptions are caught so that KeyboardInterrupt and
                    # SystemExit still propagate.
                    try:
                        return tex2mathml(content, *args, **kwargs)
                    except Exception:
                        return content

                def replace_math_no_render(match):
                    # Show the TeX source itself, colourised, with its delimiters.
                    content = match.group(1)
                    if 'mode=display' in match.group(0):
                        content = content.replace('\n', '</br>')
                        return f"<font color=\"#00FF00\">$$</font><font color=\"#FF00FF\">{content}</font><font color=\"#00FF00\">$$</font>"
                    else:
                        return f"<font color=\"#00FF00\">$</font><font color=\"#FF00FF\">{content}</font><font color=\"#00FF00\">$</font>"

                def replace_math_render(match):
                    # Replace the TeX with the output of tex2mathml.
                    content = match.group(1)
                    if 'mode=display' in match.group(0):
                        if '\\begin{aligned}' in content:
                            # Rewrite the aligned environment as an array and drop
                            # the alignment markers before conversion.
                            content = content.replace('\\begin{aligned}', '\\begin{array}')
                            content = content.replace('\\end{aligned}', '\\end{array}')
                            content = content.replace('&', ' ')
                        content = tex2mathml_catch_exception(content, display="block")
                        return content
                    else:
                        return tex2mathml_catch_exception(content)

                def markdown_bug_hunt(content):
                    """
                    Work around an mdx_math bug: a single-$ span wrapping a \\begin
                    command produces a redundant nested <script> element.
                    """
                    content = content.replace('<script type="math/tex">\n<script type="math/tex; mode=display">', '<script type="math/tex; mode=display">')
                    content = content.replace('</script>\n</script>', '</script>')
                    return content

                if ('$' in txt) and ('```' not in txt):  # has $ math markers and no ``` code fence
                    # convert everything to html format
                    split = markdown.markdown(text='---')
                    convert_stage_1 = markdown.markdown(text=txt, extensions=['mdx_math', 'fenced_code', 'tables', 'sane_lists'], extension_configs=markdown_extension_configs)
                    convert_stage_1 = markdown_bug_hunt(convert_stage_1)
                    # re.DOTALL lets '.' match newlines, so multi-line equations are captured.
                    # 1. convert to easy-to-copy tex (do not render math)
                    convert_stage_2_1 = re.sub(find_equation_pattern, replace_math_no_render, convert_stage_1, flags=re.DOTALL)
                    # 2. convert to rendered equation
                    convert_stage_2_2 = re.sub(find_equation_pattern, replace_math_render, convert_stage_1, flags=re.DOTALL)
                    # cat them together
                    return pre + convert_stage_2_1 + f'{split}' + convert_stage_2_2 + suf
                else:
                    return pre + markdown.markdown(txt, extensions=['fenced_code', 'codehilite', 'tables', 'sane_lists']) + suf
         | 
| 116 | 
            +
             | 
| 117 | 
            +
             | 
| 118 | 
            +
            # Push the sample reply through the three stages in order (citation
            # rewrite, code-fence close-up, Markdown conversion), then write the
            # result to tmp.html after a small HTML head.
            sample = markdown_convertion(
                close_up_code_segment_during_stream(
                    preprocess_newbing_out(sample)
                )
            )
            with open('tmp.html', 'w', encoding='utf8') as html_file:
                html_file.write("""

            <head>
                <title>My Website</title>
                <link rel="stylesheet" type="text/css" href="style.css">
            </head>

                """)
                html_file.write(sample)
         | 
    	
        request_llm/bridge_newbing.py
    CHANGED
    
    | @@ -27,12 +27,12 @@ def preprocess_newbing_out(s): | |
| 27 | 
             
                sub = lambda m: '\['+m.group(1)+'\]' # 将匹配到的数字作为替换值
         | 
| 28 | 
             
                result = re.sub(pattern, sub, s) # 替换操作
         | 
| 29 | 
             
                if '[1]' in result:
         | 
| 30 | 
            -
                    result += '\n\n | 
| 31 | 
             
                return result
         | 
| 32 |  | 
| 33 | 
             
            def preprocess_newbing_out_simple(result):
         | 
| 34 | 
             
                if '[1]' in result:
         | 
| 35 | 
            -
                    result += '\n\n | 
| 36 | 
             
                return result
         | 
| 37 |  | 
| 38 | 
             
            class NewBingHandle(Process):
         | 
|  | |
| 27 | 
             
                sub = lambda m: '\['+m.group(1)+'\]' # 将匹配到的数字作为替换值
         | 
| 28 | 
             
                result = re.sub(pattern, sub, s) # 替换操作
         | 
| 29 | 
             
                if '[1]' in result:
         | 
| 30 | 
            +
                    result += '\n\n```reference\n' + "\n".join([r for r in result.split('\n') if r.startswith('[')]) + '\n```\n'
         | 
| 31 | 
             
                return result
         | 
| 32 |  | 
| 33 | 
             
            def preprocess_newbing_out_simple(result):
                """
                Append a fenced ``reference`` block listing the citation lines.

                When *result* contains the literal text ``[1]``, every line that
                starts with ``[`` is copied into a trailing fenced block tagged
                ``reference``. Otherwise *result* is returned unchanged.
                """
                if '[1]' not in result:
                    return result
                citation_lines = [line for line in result.split('\n') if line.startswith('[')]
                return result + '\n\n```reference\n' + "\n".join(citation_lines) + '\n```\n'
         | 
| 37 |  | 
| 38 | 
             
            class NewBingHandle(Process):
         | 
    	
        toolbox.py
    CHANGED
    
    | @@ -271,8 +271,14 @@ def markdown_convertion(txt): | |
| 271 | 
             
                    content = content.replace('</script>\n</script>', '</script>')
         | 
| 272 | 
             
                    return content
         | 
| 273 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 274 |  | 
| 275 | 
            -
                if ('$' in txt) and ( | 
| 276 | 
             
                    # convert everything to html format
         | 
| 277 | 
             
                    split = markdown.markdown(text='---')
         | 
| 278 | 
             
                    convert_stage_1 = markdown.markdown(text=txt, extensions=['mdx_math', 'fenced_code', 'tables', 'sane_lists'], extension_configs=markdown_extension_configs)
         | 
|  | |
| 271 | 
             
                    content = content.replace('</script>\n</script>', '</script>')
         | 
| 272 | 
             
                    return content
         | 
| 273 |  | 
| 274 | 
            +
                def no_code(txt):
                    """
                    Return True when *txt* contains no fenced code other than
                    ``reference`` blocks (the fence tag used for NewBing citations).

                    Splitting on the fence marker puts the text that follows each
                    opening fence at the odd indices. Every such segment must begin
                    with ``reference``, so a real code block is still detected when
                    a reference block is present in the same text.
                    """
                    fence_bodies = txt.split('```')[1::2]
                    return all(body.startswith('reference') for body in fence_bodies)
         | 
| 280 |  | 
| 281 | 
            +
                if ('$' in txt) and no_code(txt):  # 有$标识的公式符号,且没有代码段```的标识
         | 
| 282 | 
             
                    # convert everything to html format
         | 
| 283 | 
             
                    split = markdown.markdown(text='---')
         | 
| 284 | 
             
                    convert_stage_1 = markdown.markdown(text=txt, extensions=['mdx_math', 'fenced_code', 'tables', 'sane_lists'], extension_configs=markdown_extension_configs)
         |