Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Upload utils_chapters.py
Browse files- src/test/utils_chapters.py +79 -0
    	
        src/test/utils_chapters.py
    ADDED
    
    | @@ -0,0 +1,79 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import re
         | 
| 2 | 
            +
             | 
| 3 | 
            +
             | 
| 4 | 
            +
            def extract_chapters(output: str | list[str]):
         | 
| 5 | 
            +
                """
         | 
| 6 | 
            +
                Extract chapters from the given output string or list of strings.
         | 
| 7 | 
            +
             | 
| 8 | 
            +
                Args:
         | 
| 9 | 
            +
                    output (str | list[str]): The input text containing chapter information.
         | 
| 10 | 
            +
                    vid_duration (str | None): The video duration in hh:mm:ss format. Default is None.
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                Returns:
         | 
| 13 | 
            +
                    dict: A dictionary of extracted chapters with timestamps as keys and titles as values.
         | 
| 14 | 
            +
                """
         | 
| 15 | 
            +
             | 
| 16 | 
            +
                # Only capture the first timestamp (hh:mm:ss) and ignore the second.
         | 
| 17 | 
            +
                pattern = r"(\d{2}:[0-5]\d:[0-5]\d)\b"
         | 
| 18 | 
            +
                chapters = {}
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                if isinstance(output, str):
         | 
| 21 | 
            +
                    output = output.split("\n")
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                for line in output:
         | 
| 24 | 
            +
                    if len(line) == 0:
         | 
| 25 | 
            +
                        continue
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                    match = re.search(pattern, line)
         | 
| 28 | 
            +
                    if match:
         | 
| 29 | 
            +
                        time = match.group(1)
         | 
| 30 | 
            +
                        # Strip any additional timestamp or text following it
         | 
| 31 | 
            +
                        title = re.sub(pattern, "", line).strip()
         | 
| 32 | 
            +
                        title = title.lstrip(" -:")  # Remove leading dash, colon, or space
         | 
| 33 | 
            +
                        title = title.strip()
         | 
| 34 | 
            +
                        if len(title) > 0:
         | 
| 35 | 
            +
                            chapters[time] = title
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                return chapters
         | 
| 38 | 
            +
             | 
| 39 | 
            +
             | 
| 40 | 
            +
            def filter_chapters(chapters: dict, vid_duration: str | None = None):
         | 
| 41 | 
            +
                if vid_duration:
         | 
| 42 | 
            +
                    filter_chapters = {}
         | 
| 43 | 
            +
                    for k, v in sorted(chapters.items()):
         | 
| 44 | 
            +
                        if k > vid_duration:
         | 
| 45 | 
            +
                            break
         | 
| 46 | 
            +
                        filter_chapters[k] = v
         | 
| 47 | 
            +
                    chapters = filter_chapters
         | 
| 48 | 
            +
             | 
| 49 | 
            +
                # Check if chapters are in ordered by time
         | 
| 50 | 
            +
                times = list(chapters.keys())
         | 
| 51 | 
            +
                for i in range(1, len(times)):
         | 
| 52 | 
            +
                    if times[i] < times[i - 1]:
         | 
| 53 | 
            +
                        return {}
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                # remove empty chapters
         | 
| 56 | 
            +
                chapters = {k: v for k, v in chapters.items() if len(v) > 0}
         | 
| 57 | 
            +
             | 
| 58 | 
            +
                # if only one chapter at 00:00:00, return empty dict
         | 
| 59 | 
            +
                if len(chapters) == 1 and list(chapters.keys())[0] == "00:00:00":
         | 
| 60 | 
            +
                    return {}
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                return chapters
         | 
| 63 | 
            +
             | 
| 64 | 
            +
             | 
| 65 | 
            +
            if __name__ == "__main__":
                # Demo: parse a sample chapter listing, then keep only the chapters
                # that fall within a 40-minute video.
                sample = """
                00:00:00 Introduction - good
                00:05:30 - 00:05:33: Second Chapter
                00:05:33: Another Chapter
                00:90:00 - Wrong time
                00:42:00 - After video duration
                00:39:00 - What is this?
                01:04:00 - Outside of video duration
                """
                parsed = extract_chapters(sample)
                kept = filter_chapters(parsed, vid_duration="00:40:00")
                for stamp in kept:
                    print(f"Time: {stamp}, Title: {kept[stamp]}")
         |