Welcome to Etherpad!
This pad text is synchronized as you type, so that everyone viewing this page sees the same text. This allows you to collaborate seamlessly on documents!
Get involved with Etherpad at
http://etherpad.org
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import random
import re
from main.settings import db
import srt
import datetime
from pattern.en import parse
## Input files for this run.
## Path of the original video; stored as 'filename' on every sentence record.
videofile = 'input/weekly-address.mp4'
## Path of the subtitle file that is read and handed to parse_vtt().
## Despite the 'srt' in the name, the path points at a .vtt file.
srtfile = 'input/subtitle.vtt'
def convert_time(timestring):
    """Convert a subtitle timestamp string into seconds.

    Every run of digits in ``timestring`` is read as one numeric field; the
    separators between the fields are not checked.  Four (or more) fields
    are read as hours, minutes, seconds, milliseconds (``00:01:02.500``),
    any further fields being ignored.  Exactly three fields are read as
    minutes, seconds, milliseconds (``01:02.500``), the short form WebVTT
    allows when the hour is omitted.

    Args:
        timestring: Text containing a timestamp.

    Returns:
        The timestamp as a float number of seconds.

    Raises:
        ValueError: If ``timestring`` holds fewer than three digit fields.
    """
    ## Build a real list: on Python 3 ``map`` returns a lazy iterator that
    ## cannot be indexed.
    nums = [float(digits) for digits in re.findall(r'\d+', timestring)]
    if len(nums) == 3:
        ## Short form without hours: treat the hour field as zero.
        nums.insert(0, 0.0)
    if len(nums) < 4:
        raise ValueError("unrecognised timestamp: %r" % (timestring,))
    hours, minutes, seconds, millis = nums[:4]
    return 3600 * hours + 60 * minutes + seconds + millis / 1000
def parse_vtt(lines):
    """Group the lines of a subtitle file into timed cues.

    A line containing one or more ``h:m:s.fraction`` timestamps opens a cue.
    The lines that follow are joined (line breaks become spaces) into the
    cue text until a line consisting only of a line break closes the cue.
    Lines seen while no cue is open (headers, stray text) are ignored.

    Args:
        lines: Iterable of lines, each still carrying its line terminator
            (for example the result of ``file.readlines()``).

    Returns:
        A list of ``(times, text)`` tuples.  ``times`` is a list holding the
        result of ``convert_time`` for every timestamp on the cue's timing
        line, in order of appearance; ``text`` is the joined cue text with
        one trailing whitespace character removed.
    """
    ## ``+`` rather than ``*``: every field must contain at least one digit,
    ## so fragments such as "::." are not mistaken for a timestamp.
    timecode = re.compile(r"[0-9]+:[0-9]+:[0-9]+\.[0-9]+")

    timed_texts = []
    current_times, current_text = None, ""

    for line in lines:
        times = timecode.findall(line)
        if times:
            ## Found a timecode. Store a real list so callers can index it
            ## (``map`` is a lazy iterator on Python 3).
            current_times = [convert_time(stamp) for stamp in times]
        elif current_times is None:
            ## Not inside a cue: nothing to collect.
            continue
        elif re.match(r"^\r?\n$", line):
            ## Only a line-break. Add text and timecode to timed_texts.
            timed_texts.append(
                (current_times, re.sub(r'\s$', '', current_text)))
            current_times, current_text = None, ""
        else:
            ## Extend the text of the open cue.
            current_text = current_text + re.sub(r'\r?\n', ' ', line)

    ## The input may end without a closing blank line; emit the cue that is
    ## still open instead of silently dropping it.
    if current_times is not None:
        timed_texts.append((current_times, re.sub(r'\s$', '', current_text)))

    return timed_texts
def _align_words(text, tokens, start, char_duration):
    """Estimate word timings by spreading a cue evenly over its characters.

    Args:
        text: The cue text the tokens were produced from.
        tokens: Sequence of tokens; item 0 of each is the word and item 1
            its tag.
        start: Start time of the cue, in seconds.
        char_duration: Seconds allotted to a single character of ``text``.

    Returns:
        A list of dicts with the keys 'word', 'start', 'end', 'duration'
        and 'tag', one per token, in token order.
    """
    words = []
    cursor = 0
    for token in tokens:
        word, tag = token[0], token[1]
        position = text.find(word, cursor)
        if position == -1:
            ## The token is not a literal substring of the remaining text.
            ## Place it where the search began instead of using find()'s -1
            ## as if it were an index.
            position = cursor
        else:
            ## Continue after this occurrence so that a repeated word, or a
            ## word contained in the previous one ("cat" then "at"), is not
            ## matched at the same place again.
            cursor = position + len(word)
        word_start = position * char_duration
        word_duration = len(word) * char_duration
        words.append({
            'word': word,
            'start': start + word_start,
            'end': start + word_start + word_duration,
            'duration': word_duration,
            'tag': tag,
        })
    return words


## Read the subtitle file up front so that the handle is closed before any
## parsing or database work starts.
with open(srtfile) as subtitle_file:
    subtitle_lines = subtitle_file.readlines()

for row in parse_vtt(subtitle_lines):
    ## list() so the start/end pair can be indexed even if it arrives as a
    ## lazy iterator.
    times = list(row[0])
    text = row[1]
    if not text:
        ## A cue without text has no characters to spread its duration over
        ## (this used to divide by zero) and no words to store.
        continue

    sentence = {
        'filename': videofile,
        'text': text,
        'start': times[0],
        'end': times[1],
        'duration': times[1] - times[0],
        'words': []
    }
    char_duration = sentence['duration'] / len(text)

    ## NOTE(review): pattern.en documents this keyword as lowercase
    ## `tokenize`; the capitalised spelling is kept unchanged here - confirm
    ## whether it has any effect.
    parsed_sentence = parse(text, Tokenize=True, chunks=False)

    ## split() yields one token list per sentence the parser detected; use
    ## all of them so words after the first sentence of a cue are kept.
    tokens = [token for tagged in parsed_sentence.split() for token in tagged]
    sentence['words'] = _align_words(
        text, tokens, sentence['start'], char_duration)

    db.sentences.insert(sentence)