Skip to content

Commit 0a593d5

Browse files
committed
srt to tsv
1 parent 2a6312a commit 0a593d5

File tree

1 file changed

+141
-0
lines changed

1 file changed

+141
-0
lines changed
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
import re
2+
import sys
3+
import os
4+
5+
class Subtitle:
    """A single subtitle cue: sequence number, time range and text lines."""

    def __init__(self, index, start_time, end_time, text_lines):
        """Store the cue fields as given.

        ``text_lines`` is kept by reference (not copied) and exposed as
        ``self.text``, so in-place edits to ``text`` are also visible through
        the list object that was passed in.
        """
        self.index, self.start_time, self.end_time = index, start_time, end_time
        self.text = text_lines

    def __repr__(self):
        """Return a short debug form showing the cue number and line count."""
        line_count = len(self.text)
        return f"Subtitle(#{self.index}, Lines: {line_count})"
14+
15+
def parse_srt_content(content):
    """Parse the full text of an SRT file into a list of Subtitle objects.

    A cue starts with a number, followed by a ``start --> end`` timestamp
    line; everything up to the next cue header (or the end of the input) is
    its text. Each text line is stripped of surrounding whitespace (this
    includes non-breaking spaces) and blank lines are dropped, so a cue with
    no visible text yields a Subtitle whose text list is empty.

    Args:
        content: The SRT document as one string.

    Returns:
        A list of Subtitle objects in the order the cues appear.
    """
    pattern = re.compile(
        r'(\d+)\s+([\d:,]+) --> ([\d:,]+)'  # cue number, start and end timestamps
        # Consume only the blanks left on the timestamp line. A plain \s*
        # would also eat the blank line of an empty cue, after which the
        # lazy text group could only stop by swallowing the following cue.
        r'[^\S\n]*'
        r'(.*?)'  # cue text, matched lazily
        # Stop before the next cue header. Requiring the timestamp line after
        # the number keeps a digits-only text line (e.g. "2024") from being
        # mistaken for the start of a new cue.
        r'(?=\n\d+\s*\n[\d:,]+ --> |\Z)',
        re.DOTALL,
    )

    subtitles = []
    for match in pattern.finditer(content):
        index = int(match.group(1))
        start_time = match.group(2)
        end_time = match.group(3)

        # str.strip() removes every Unicode whitespace character, so lines
        # made only of spaces or non-breaking spaces collapse to "" here and
        # are filtered out.
        stripped = (raw.strip() for raw in match.group(4).split('\n'))
        text_lines = [line for line in stripped if line]

        subtitles.append(Subtitle(index, start_time, end_time, text_lines))

    return subtitles
39+
40+
def parse_srt_sequentially(file_path):
    """Parse an SRT file line by line into a list of Subtitle objects.

    A cue begins at a line holding only a number that is immediately
    followed by a line containing ``-->``. Every later non-blank line, up to
    the next such pair, is collected as that cue's text. Lines that appear
    before the first cue are ignored.

    Args:
        file_path: Path of the SRT file, read as UTF-8 (a leading BOM is
            tolerated).

    Returns:
        A list of Subtitle objects in file order, or an empty list when
        ``file_path`` does not exist.
    """
    subtitles = []

    if not os.path.exists(file_path):
        return subtitles

    # utf-8-sig reads plain UTF-8 as well, but drops a leading byte-order
    # mark that would otherwise stick to the first cue number and make it
    # fail the digit test below.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        lines = [raw.rstrip('\n\r') for raw in f]

    current_index = None
    current_times = None
    current_text = []

    total = len(lines)
    i = 0
    while i < total:
        line = lines[i].strip()

        # A number only opens a new cue when the timestamp line follows it.
        # Otherwise it is ordinary text (e.g. a caption reading "2024") and
        # must neither end the current cue nor cause the next line to be
        # skipped. isdecimal() is used because int() rejects some characters
        # that isdigit() accepts, such as superscript digits.
        if line.isdecimal() and i + 1 < total and "-->" in lines[i + 1]:
            if current_index is not None:
                subtitles.append(
                    Subtitle(current_index, current_times[0], current_times[1], current_text)
                )
            times = lines[i + 1].split("-->")
            current_index = int(line)
            current_times = (times[0].strip(), times[1].strip())
            current_text = []
            i += 2
            continue

        if current_index is not None:
            # Drop non-breaking spaces so lines made only of them count as blank.
            clean_line = line.replace('\xa0', '').strip()
            if clean_line:
                current_text.append(clean_line)

        i += 1

    # The final cue has no following header to trigger its flush.
    if current_index is not None:
        subtitles.append(
            Subtitle(current_index, current_times[0], current_times[1], current_text)
        )

    return subtitles
88+
89+
90+
def load_srt_file(file_path):
    """Read an SRT file and return its cues as a list of Subtitle objects.

    The file is decoded as UTF-8 and a leading byte-order mark, if present,
    is dropped. Any failure is reported on stderr and results in an empty
    list rather than an exception.

    Args:
        file_path: Path of the SRT file to read.

    Returns:
        The list produced by ``parse_srt_content``, or ``[]`` on error.
    """
    try:
        # utf-8-sig also reads BOM-less UTF-8; it only strips the marker.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            content = f.read()
        return parse_srt_content(content)
    except FileNotFoundError:
        # Diagnostics go to stderr so they never mix with data written to stdout.
        print(f"Error: The file '{file_path}' was not found.", file=sys.stderr)
        return []
    except Exception as e:
        print(f"An error occurred: {e}", file=sys.stderr)
        return []
102+
103+
def clean_repeated_lines(subtitles):
    """Drop a cue's last line when the following cue starts with the same line.

    Each pair of neighbouring cues is compared once. The cue lists are
    edited in place and the same ``subtitles`` object is returned. Pairs in
    which either cue has no text are left untouched.
    """
    # Walk neighbouring pairs; the final cue has no successor to compare with.
    for earlier, later in zip(subtitles, subtitles[1:]):
        if not earlier.text or not later.text:
            continue
        if earlier.text[-1] == later.text[0]:
            del earlier.text[-1]

    return subtitles
124+
125+
def remove_empty_subtitles(subtitles):
    """Return a new list holding only the cues that have at least one text line.

    The input list itself is not modified; the cue objects are shared
    between the input and the result.
    """
    kept = []
    for entry in subtitles:
        if len(entry.text) == 0:
            continue
        kept.append(entry)
    return kept
132+
133+
# --- Execution ---
def main(argv=None):
    """Convert the SRT file named on the command line to TSV on stdout.

    Each cue that still has text after repeated-line cleanup is printed as
    ``[index]<TAB>start<TAB>end<TAB>text``, with the cue's text lines joined
    by a single space so that words from adjacent lines do not run together.

    Args:
        argv: Argument list in ``sys.argv`` form (program name first).
            Defaults to ``sys.argv``.

    Returns:
        0 on success, 2 when no input file was given.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        prog = args[0] if args else "srt_to_tsv"
        print(f"Usage: {prog} <file.srt>", file=sys.stderr)
        return 2

    file_name = args[1]
    # load_srt_file(file_name) is the regex-based alternative to this parser.
    subtitle_objects = parse_srt_sequentially(file_name)

    for sub in remove_empty_subtitles(clean_repeated_lines(subtitle_objects)):
        # Build the text outside the f-string: reusing double quotes inside a
        # replacement field is a syntax error before Python 3.12.
        text = " ".join(sub.text)
        print(f"[{sub.index}]\t{sub.start_time}\t{sub.end_time}\t{text}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)