Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	๐ docstrings
Browse files
Signed-off-by: peter szemraj <[email protected]>
    	
        app.py
    CHANGED
    
    | @@ -1,3 +1,9 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1 | 
             
            import contextlib
         | 
| 2 | 
             
            import logging
         | 
| 3 | 
             
            import os
         | 
| @@ -19,7 +25,6 @@ import gradio as gr | |
| 19 | 
             
            import nltk
         | 
| 20 | 
             
            import torch
         | 
| 21 | 
             
            from cleantext import clean
         | 
| 22 | 
            -
            from doctr.io import DocumentFile
         | 
| 23 | 
             
            from doctr.models import ocr_predictor
         | 
| 24 |  | 
| 25 | 
             
            from pdf2text import convert_PDF_to_Text
         | 
| @@ -28,7 +33,7 @@ from utils import load_example_filenames, saves_summary, truncate_word_count | |
| 28 |  | 
| 29 | 
             
            _here = Path(__file__).parent
         | 
| 30 |  | 
| 31 | 
            -
            nltk.download("stopwords" | 
| 32 |  | 
| 33 |  | 
| 34 | 
             
            MODEL_OPTIONS = [
         | 
| @@ -37,7 +42,7 @@ MODEL_OPTIONS = [ | |
| 37 | 
             
                "pszemraj/long-t5-tglobal-base-sci-simplify-elife",
         | 
| 38 | 
             
                "pszemraj/long-t5-tglobal-base-16384-booksci-summary-v1",
         | 
| 39 | 
             
                "pszemraj/pegasus-x-large-book-summary",
         | 
| 40 | 
            -
            ]
         | 
| 41 |  | 
| 42 |  | 
| 43 | 
             
            def predict(
         | 
| @@ -46,8 +51,16 @@ def predict( | |
| 46 | 
             
                token_batch_length: int = 1024,
         | 
| 47 | 
             
                empty_cache: bool = True,
         | 
| 48 | 
             
                **settings,
         | 
| 49 | 
            -
            ):
         | 
| 50 | 
            -
                """ | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 51 | 
             
                if torch.cuda.is_available() and empty_cache:
         | 
| 52 | 
             
                    torch.cuda.empty_cache()
         | 
| 53 |  | 
| @@ -143,9 +156,11 @@ def proc_submission( | |
| 143 | 
             
                    token_batch_length=token_batch_length,
         | 
| 144 | 
             
                    **settings,
         | 
| 145 | 
             
                )
         | 
| 146 | 
            -
                sum_text = [ | 
|  | |
|  | |
| 147 | 
             
                sum_scores = [
         | 
| 148 | 
            -
                    f" -  | 
| 149 | 
             
                    for i, s in enumerate(_summaries)
         | 
| 150 | 
             
                ]
         | 
| 151 |  | 
| @@ -153,9 +168,9 @@ def proc_submission( | |
| 153 | 
             
                history["Summary Scores"] = "<br><br>"
         | 
| 154 | 
             
                scores_out = "\n".join(sum_scores)
         | 
| 155 | 
             
                rt = round((time.perf_counter() - st) / 60, 2)
         | 
| 156 | 
            -
                 | 
| 157 | 
             
                html = ""
         | 
| 158 | 
            -
                html += f"<p>Runtime: {rt} minutes  | 
| 159 | 
             
                if msg is not None:
         | 
| 160 | 
             
                    html += msg
         | 
| 161 |  | 
| @@ -170,11 +185,13 @@ def proc_submission( | |
| 170 | 
             
            def load_single_example_text(
         | 
| 171 | 
             
                example_path: str or Path,
         | 
| 172 | 
             
                max_pages=20,
         | 
| 173 | 
            -
            ):
         | 
| 174 | 
             
                """
         | 
| 175 | 
            -
                 | 
| 176 | 
            -
             | 
| 177 | 
            -
             | 
|  | |
|  | |
| 178 | 
             
                """
         | 
| 179 | 
             
                global name_to_path
         | 
| 180 | 
             
                full_ex_path = name_to_path[example_path]
         | 
| @@ -198,30 +215,27 @@ def load_single_example_text( | |
| 198 | 
             
                return text
         | 
| 199 |  | 
| 200 |  | 
| 201 | 
            -
            def load_uploaded_file(file_obj, max_pages=20):
         | 
| 202 | 
             
                """
         | 
| 203 | 
            -
                load_uploaded_file -  | 
| 204 | 
            -
             | 
| 205 | 
            -
                Args:
         | 
| 206 | 
            -
                    file_obj (POTENTIALLY list): Gradio file object inside a list
         | 
| 207 |  | 
| 208 | 
            -
                 | 
| 209 | 
            -
             | 
|  | |
|  | |
| 210 | 
             
                """
         | 
| 211 | 
            -
             | 
| 212 | 
            -
                # file_path = Path(file_obj[0].name)
         | 
| 213 | 
            -
             | 
| 214 | 
             
                # check if mysterious file object is a list
         | 
| 215 | 
             
                if isinstance(file_obj, list):
         | 
| 216 | 
             
                    file_obj = file_obj[0]
         | 
| 217 | 
             
                file_path = Path(file_obj.name)
         | 
| 218 | 
             
                try:
         | 
|  | |
| 219 | 
             
                    if file_path.suffix == ".txt":
         | 
| 220 | 
             
                        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
         | 
| 221 | 
             
                            raw_text = f.read()
         | 
| 222 | 
            -
                        text = clean(raw_text, lower= | 
| 223 | 
             
                    elif file_path.suffix == ".pdf":
         | 
| 224 | 
            -
                        logging.info(f" | 
| 225 | 
             
                        conversion_stats = convert_PDF_to_Text(
         | 
| 226 | 
             
                            file_path,
         | 
| 227 | 
             
                            ocr_model=ocr_model,
         | 
| @@ -230,11 +244,11 @@ def load_uploaded_file(file_obj, max_pages=20): | |
| 230 | 
             
                        text = conversion_stats["converted_text"]
         | 
| 231 | 
             
                    else:
         | 
| 232 | 
             
                        logging.error(f"Unknown file type {file_path.suffix}")
         | 
| 233 | 
            -
                        text = "ERROR - check  | 
| 234 |  | 
| 235 | 
             
                    return text
         | 
| 236 | 
             
                except Exception as e:
         | 
| 237 | 
            -
                    logging. | 
| 238 | 
             
                    return "Error: Could not read file. Ensure that it is a valid text file with encoding UTF-8 if text, and a PDF if PDF."
         | 
| 239 |  | 
| 240 |  | 
|  | |
| 1 | 
            +
            """
         | 
| 2 | 
            +
            app.py - the main module for the gradio app
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            Usage:
         | 
| 5 | 
            +
                python app.py
         | 
| 6 | 
            +
            """
         | 
| 7 | 
             
            import contextlib
         | 
| 8 | 
             
            import logging
         | 
| 9 | 
             
            import os
         | 
|  | |
| 25 | 
             
            import nltk
         | 
| 26 | 
             
            import torch
         | 
| 27 | 
             
            from cleantext import clean
         | 
|  | |
| 28 | 
             
            from doctr.models import ocr_predictor
         | 
| 29 |  | 
| 30 | 
             
            from pdf2text import convert_PDF_to_Text
         | 
|  | |
| 33 |  | 
| 34 | 
             
            _here = Path(__file__).parent
         | 
| 35 |  | 
| 36 | 
            +
            nltk.download("stopwords", quiet=True)
         | 
| 37 |  | 
| 38 |  | 
| 39 | 
             
            MODEL_OPTIONS = [
         | 
|  | |
| 42 | 
             
                "pszemraj/long-t5-tglobal-base-sci-simplify-elife",
         | 
| 43 | 
             
                "pszemraj/long-t5-tglobal-base-16384-booksci-summary-v1",
         | 
| 44 | 
             
                "pszemraj/pegasus-x-large-book-summary",
         | 
| 45 | 
            +
            ]  # models users can choose from
         | 
| 46 |  | 
| 47 |  | 
| 48 | 
             
            def predict(
         | 
|  | |
| 51 | 
             
                token_batch_length: int = 1024,
         | 
| 52 | 
             
                empty_cache: bool = True,
         | 
| 53 | 
             
                **settings,
         | 
| 54 | 
            +
            ) -> list:
         | 
| 55 | 
            +
                """
         | 
| 56 | 
            +
                predict - helper fn to support multiple models for summarization at once
         | 
| 57 | 
            +
             | 
| 58 | 
            +
                :param str input_text: the input text to summarize
         | 
| 59 | 
            +
                :param str model_name: model name to use
         | 
| 60 | 
            +
                :param int token_batch_length: the length of the token batches to use
         | 
| 61 | 
            +
                :param bool empty_cache: whether to empty the cache before loading a new model
         | 
| 62 | 
            +
                :return: list of dicts with keys "summary" and "score"
         | 
| 63 | 
            +
                """
         | 
| 64 | 
             
                if torch.cuda.is_available() and empty_cache:
         | 
| 65 | 
             
                    torch.cuda.empty_cache()
         | 
| 66 |  | 
|  | |
| 156 | 
             
                    token_batch_length=token_batch_length,
         | 
| 157 | 
             
                    **settings,
         | 
| 158 | 
             
                )
         | 
| 159 | 
            +
                sum_text = [
         | 
| 160 | 
            +
                    f"Batch {i}:\n\t" + s["summary"][0] for i, s in enumerate(_summaries, start=1)
         | 
| 161 | 
            +
                ]
         | 
| 162 | 
             
                sum_scores = [
         | 
| 163 | 
            +
                    f" - Batch Summary {i}: {round(s['summary_score'],4)}"
         | 
| 164 | 
             
                    for i, s in enumerate(_summaries)
         | 
| 165 | 
             
                ]
         | 
| 166 |  | 
|  | |
| 168 | 
             
                history["Summary Scores"] = "<br><br>"
         | 
| 169 | 
             
                scores_out = "\n".join(sum_scores)
         | 
| 170 | 
             
                rt = round((time.perf_counter() - st) / 60, 2)
         | 
| 171 | 
            +
                logging.info(f"Runtime: {rt} minutes")
         | 
| 172 | 
             
                html = ""
         | 
| 173 | 
            +
                html += f"<p>Runtime: {rt} minutes with model: {model_name}</p>"
         | 
| 174 | 
             
                if msg is not None:
         | 
| 175 | 
             
                    html += msg
         | 
| 176 |  | 
|  | |
| 185 | 
             
            def load_single_example_text(
         | 
| 186 | 
             
                example_path: str or Path,
         | 
| 187 | 
             
                max_pages=20,
         | 
| 188 | 
            +
            ) -> str:
         | 
| 189 | 
             
                """
         | 
| 190 | 
            +
                load_single_example_text - loads a single example text file
         | 
| 191 | 
            +
             | 
| 192 | 
            +
                :param str or Path example_path: name of the example to load
         | 
| 193 | 
            +
                :param int max_pages: the maximum number of pages to load from a PDF
         | 
| 194 | 
            +
                :return str: the text of the example
         | 
| 195 | 
             
                """
         | 
| 196 | 
             
                global name_to_path
         | 
| 197 | 
             
                full_ex_path = name_to_path[example_path]
         | 
|  | |
| 215 | 
             
                return text
         | 
| 216 |  | 
| 217 |  | 
| 218 | 
            +
            def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> str:
         | 
| 219 | 
             
                """
         | 
| 220 | 
            +
                load_uploaded_file - loads a file uploaded by the user
         | 
|  | |
|  | |
|  | |
| 221 |  | 
| 222 | 
            +
                :param file_obj (POTENTIALLY list): Gradio file object inside a list
         | 
| 223 | 
            +
                :param int max_pages: the maximum number of pages to load from a PDF
         | 
| 224 | 
            +
                :param bool lower: whether to lowercase the text
         | 
| 225 | 
            +
                :return str: the text of the file
         | 
| 226 | 
             
                """
         | 
|  | |
|  | |
|  | |
| 227 | 
             
                # check if mysterious file object is a list
         | 
| 228 | 
             
                if isinstance(file_obj, list):
         | 
| 229 | 
             
                    file_obj = file_obj[0]
         | 
| 230 | 
             
                file_path = Path(file_obj.name)
         | 
| 231 | 
             
                try:
         | 
| 232 | 
            +
                    logging.info(f"Loading file:\t{file_path}")
         | 
| 233 | 
             
                    if file_path.suffix == ".txt":
         | 
| 234 | 
             
                        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
         | 
| 235 | 
             
                            raw_text = f.read()
         | 
| 236 | 
            +
                        text = clean(raw_text, lower=lower)
         | 
| 237 | 
             
                    elif file_path.suffix == ".pdf":
         | 
| 238 | 
            +
                        logging.info(f"loading as PDF file {file_path}")
         | 
| 239 | 
             
                        conversion_stats = convert_PDF_to_Text(
         | 
| 240 | 
             
                            file_path,
         | 
| 241 | 
             
                            ocr_model=ocr_model,
         | 
|  | |
| 244 | 
             
                        text = conversion_stats["converted_text"]
         | 
| 245 | 
             
                    else:
         | 
| 246 | 
             
                        logging.error(f"Unknown file type {file_path.suffix}")
         | 
| 247 | 
            +
                        text = "ERROR - check file - unknown file type"
         | 
| 248 |  | 
| 249 | 
             
                    return text
         | 
| 250 | 
             
                except Exception as e:
         | 
| 251 | 
            +
                    logging.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
         | 
| 252 | 
             
                    return "Error: Could not read file. Ensure that it is a valid text file with encoding UTF-8 if text, and a PDF if PDF."
         | 
| 253 |  | 
| 254 |  | 
