1+ import re
2+ import sys
3+ import os
4+
class Subtitle:
    """A single subtitle entry: sequence number, timing and text lines."""

    def __init__(self, index, start_time, end_time, text_lines):
        # The three header values are stored exactly as they were passed in.
        self.index, self.start_time, self.end_time = index, start_time, end_time
        # The list of text lines is kept by reference under the name ``text``.
        self.text = text_lines

    def __repr__(self):
        # Short debugging form: the index plus the number of text lines.
        line_count = len(self.text)
        return f"Subtitle(#{self.index} , Lines: {line_count} )"
14+
# One cue: index, whitespace, "start --> end", then the text.
#  * [^\S\n]* after the end time consumes only same-line whitespace, so the
#    blank line of an empty cue is left for the lookahead instead of being
#    eaten (which would make the cue swallow the next one as its text).
#  * (.*?) is lazy: it grows one character at a time until the lookahead
#    sees the next cue header or the end of the input.
#  * The lookahead requires "index + timestamp arrow", not just a numeric
#    line, so a text line such as "2024" does not end the cue early.
_SRT_CUE_RE = re.compile(
    r'(\d+)\s+([\d:,]+) --> ([\d:,]+)[^\S\n]*(.*?)(?=\n\d+\s+[\d:,]+ -->|\Z)',
    re.DOTALL,
)


def parse_srt_content(content):
    """Parse the text of an SRT file into Subtitle objects.

    Args:
        content: The complete SRT text as one string.

    Returns:
        A list of Subtitle objects in order of appearance.  ``index`` is an
        int, the two timestamps are the raw matched strings, and ``text`` is
        a list of stripped, non-empty lines (an empty list for a blank cue).
    """
    subtitles = []
    for match in _SRT_CUE_RE.finditer(content):
        index, start_time, end_time, raw_text = match.groups()

        # str.strip() also removes non-breaking spaces (\xa0) and a trailing
        # '\r', so whitespace-only lines collapse to '' and are dropped.
        stripped_lines = (line.strip() for line in raw_text.split('\n'))
        text_lines = [line for line in stripped_lines if line]

        subtitles.append(Subtitle(int(index), start_time, end_time, text_lines))

    return subtitles
39+
def parse_srt_sequentially(file_path):
    """Parse an SRT file line by line into Subtitle objects.

    A cue starts at a line made only of decimal digits that is immediately
    followed by a line containing "-->".  Every other non-blank line is
    added to the text of the cue currently being built; lines that appear
    before the first cue header are discarded.

    Args:
        file_path: Path of the SRT file.  It is read as UTF-8 and a leading
            byte-order mark is skipped.

    Returns:
        A list of Subtitle objects in file order, or an empty list when the
        file does not exist.
    """
    try:
        # 'utf-8-sig' drops a leading BOM, which would otherwise stick to the
        # first index line and make it fail the digit test below.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            lines = [line.strip() for line in f]
    except (FileNotFoundError, NotADirectoryError):
        return []

    subtitles = []
    current_index = None
    current_times = None
    current_text = []

    line_count = len(lines)
    i = 0
    while i < line_count:
        line = lines[i]

        # A number only opens a new cue when the timestamp line follows it;
        # otherwise it is ordinary subtitle text (a year, a score, ...).
        # isdecimal() is used because isdigit() also accepts characters such
        # as superscripts that int() cannot convert.
        if line.isdecimal() and i + 1 < line_count and "-->" in lines[i + 1]:
            # Flush the cue that was being built before starting the next.
            if current_index is not None:
                subtitles.append(
                    Subtitle(current_index, current_times[0], current_times[1], current_text)
                )

            start_time, end_time = lines[i + 1].split("-->")[:2]
            current_index = int(line)
            current_times = (start_time.strip(), end_time.strip())
            current_text = []
            i += 2  # skip the index line and the timestamp line
            continue

        # Turn non-breaking spaces into plain spaces so words stay separated;
        # lines that end up empty carry no text and are ignored.
        clean_line = line.replace('\xa0', ' ').strip()
        if clean_line:
            current_text.append(clean_line)
        i += 1

    # The last cue has no following header to trigger its flush.
    if current_index is not None:
        subtitles.append(
            Subtitle(current_index, current_times[0], current_times[1], current_text)
        )

    return subtitles
88+
89+
def load_srt_file(file_path):
    """Read an SRT file and parse it with parse_srt_content.

    Args:
        file_path: Path of the UTF-8 encoded SRT file.

    Returns:
        The list produced by parse_srt_content, or an empty list when the
        file cannot be opened, read or decoded.  In that case a message is
        written to stderr so it does not mix with data printed on stdout.
    """
    # Only the file access is guarded; parsing happens outside the try so a
    # programming error in the parser is not reported as a file problem.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.", file=sys.stderr)
        return []
    except (OSError, ValueError) as e:
        # OSError: permissions, directories, I/O failures.
        # ValueError: undecodable bytes (UnicodeDecodeError) or a bad path.
        print(f"An error occurred: {e}", file=sys.stderr)
        return []

    return parse_srt_content(content)
102+
def clean_repeated_lines(subtitles):
    """
    Drop the trailing line of a subtitle when the subtitle right after it
    opens with exactly the same line.

    The subtitle objects are modified in place and the same list object is
    returned.  A pair is left untouched when either side has no text.
    """
    # Walk adjacent (earlier, later) pairs; a list with fewer than two
    # entries produces no pairs at all.
    for earlier, later in zip(subtitles, subtitles[1:]):
        if not earlier.text or not later.text:
            continue

        # Only the single last line is compared and, on a match, removed.
        if earlier.text[-1] == later.text[0]:
            earlier.text.pop()

    return subtitles
124+
def remove_empty_subtitles(subtitles):
    """
    Build a new list holding only the subtitle objects whose ``text``
    contains at least one line.  The input list is not modified.
    """
    def has_text(entry):
        # A subtitle is kept when its list of lines is non-empty.
        return len(entry.text) > 0

    return list(filter(has_text, subtitles))
132+
# --- Execution ---
def main(argv=None):
    """Print the cleaned subtitles of one SRT file as tab-separated lines.

    The file is parsed with parse_srt_sequentially (load_srt_file is the
    regex-based alternative), repeated boundary lines are removed with
    clean_repeated_lines, and cues left without text are dropped.

    Args:
        argv: Command-line arguments without the program name.  Defaults to
            ``sys.argv[1:]``.  Only the first argument, the path of the SRT
            file, is used.

    Returns:
        The process exit status: 0 on success, 2 when no path was given.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        print(f"Usage: {sys.argv[0]} <file.srt>", file=sys.stderr)
        return 2

    subtitle_objects = parse_srt_sequentially(args[0])

    for sub in remove_empty_subtitles(clean_repeated_lines(subtitle_objects)):
        # Joined outside the f-string: nesting the same quote type inside an
        # f-string is a syntax error before Python 3.12.
        text = "".join(sub.text)
        print(f"[{sub.index}]\t{sub.start_time}\t{sub.end_time}\t{text}")

    return 0


if __name__ == "__main__":
    sys.exit(main())
0 commit comments