#! /usr/bin/env python
# -*- coding: utf-8 -*-
import random
import re
from main.settings import db
import srt
import datetime
from pattern.en import parse
## Path of the original video; stored as the 'filename' of every sentence
videofile = 'input/weekly-address.mp4'
## Path of the subtitle file handed to parse_vtt (a .vtt file, despite the
## "srt" in the variable name)
srtfile = 'input/subtitle.vtt'
def convert_time(timestring):
    """Convert a timecode string such as '00:01:02.500' into seconds.

    The first four runs of digits found in *timestring* are read as hours,
    minutes, seconds and milliseconds, in that order.

    Returns the time as a float number of seconds.
    Raises IndexError when *timestring* holds fewer than four numbers.
    """
    ## Build a real list: on Python 3 map() returns an iterator that cannot
    ## be indexed, whereas a list works on both Python 2 and 3.
    nums = [float(digits) for digits in re.findall(r'\d+', timestring)]
    return 3600*nums[0] + 60*nums[1] + nums[2] + nums[3]/1000
def parse_vtt(lines):
    """Group the lines of a VTT subtitle file into timed cues.

    *lines* is an iterable of text lines, each normally ending with its
    line break (as produced by ``readlines()``).

    Returns a list of ``(times, text)`` tuples, one per cue. ``times`` is a
    list with every timecode found on the cue's timing line converted to
    seconds by convert_time (normally ``[start, end]``). ``text`` is the
    cue's text lines joined with single spaces.

    Lines that appear while no cue is open (file header, cue identifiers,
    extra blank lines) are ignored.
    """
    timecode = re.compile(r"[0-9]*:[0-9]*:[0-9]*\.[0-9]*")
    timed_texts = []
    current_times, current_text = None, ""
    for line in lines:
        times = timecode.findall(line)
        if times:
            ## Found a timecode: a new cue starts. Keep the times in a list
            ## so they can be indexed on Python 3 as well as Python 2.
            current_times = [convert_time(time) for time in times]
        elif current_times is None:
            ## Not inside a cue, nothing to collect
            continue
        elif re.match(r"^\r?\n$", line):
            ## Only a line-break: the cue is complete
            timed_texts.append((current_times, re.sub(r'\s$', '', current_text)))
            current_times, current_text = None, ""
        else:
            ## Extend the text of the open cue
            current_text = current_text + re.sub(r'\r?\n', ' ', line)
    ## A file that does not end with a blank line leaves its last cue open;
    ## flush it so that it is not silently lost.
    if current_times is not None:
        timed_texts.append((current_times, re.sub(r'\s$', '', current_text)))
    return timed_texts
## Read the subtitle cues; the with-block closes the file once it is parsed
with open(srtfile) as subtitles:
    rows = parse_vtt(subtitles.readlines())

## Store one document per cue in db.sentences, with an estimated timing for
## every word. The cue's duration is spread evenly over the characters of
## its text, so a word's timing follows from its position and length.
for times, text in rows:
    if not text:
        ## A cue without text has no words to time, and the per-character
        ## duration below would divide by zero
        continue

    sentence = {
        'filename': videofile,
        'text': text,
        'start': times[0],
        'end': times[1],
        'duration': times[1] - times[0],
        'words': []
    }

    char_duration = sentence['duration'] / len(text)
    parsed_sentence = parse(text, Tokenize=True, chunks=False)

    ## Position in the text from where the next word is searched
    cursor = 0

    ## split() yields one token list per sentence the parser detected;
    ## walk all of them so that a cue holding several sentences keeps the
    ## words that come after the first one.
    for tokens in parsed_sentence.split():
        for token in tokens:
            word = token[0]
            tag = token[1]

            found = text.find(word, cursor)
            if found == -1:
                ## The token does not occur verbatim in the remaining text
                ## (presumably rewritten by the tokenizer); estimate it at
                ## the current position rather than at index -1.
                found = cursor
            else:
                ## Continue after this word, so that a repeated word maps
                ## to its next occurrence instead of the same one.
                cursor = found + len(word)

            word_start = found * char_duration
            word_duration = len(word) * char_duration
            word_end = word_start + word_duration

            sentence['words'].append({
                'word': word,
                'start': sentence['start'] + word_start,
                'end': sentence['start'] + word_end,
                'duration': word_duration,
                'tag': tag,
            })

    db.sentences.insert(sentence)