#! /usr/bin/env python
# -*- coding: utf-8 -*-

import random
import re
from main.settings import db
import srt
import datetime
from pattern.en import parse

## Name of original video
videofile = 'input/weekly-address.mp4'
srtfile = 'input/subtitle.vtt'

## Matches timecodes of the form "hh:mm:ss.mmm" inside a cue timing line
_TIMECODE_RE = re.compile(r"[0-9]*:[0-9]*:[0-9]*\.[0-9]*")


def convert_time(timestring):
    """Convert a timecode string into seconds.

    The first four groups of digits found in *timestring* are read as
    hours, minutes, seconds and milliseconds, e.g. "00:01:02.500" -> 62.5.

    Raises IndexError when fewer than four digit groups are present.
    """
    # Build a real list: on Python 3 ``map`` returns an iterator that
    # cannot be indexed.
    nums = [float(digits) for digits in re.findall(r'\d+', timestring)]
    return 3600 * nums[0] + 60 * nums[1] + nums[2] + nums[3] / 1000


def parse_vtt(lines):
    """Collect timed texts from the lines of a subtitle file.

    *lines* is an iterable of lines including their line endings. Returns
    a list of ``(times, text)`` tuples, where ``times`` is the list of
    timecodes (in seconds) found on the cue's timing line and ``text`` is
    the cue's text lines joined with spaces.
    """
    timed_texts = []
    current_times, current_text = None, ""

    for line in lines:
        times = _TIMECODE_RE.findall(line)

        if times:
            ## Found a timecode
            current_times = [convert_time(stamp) for stamp in times]
        elif current_times is not None:
            if re.match(r"^\r?\n$", line):
                ## Only a line-break. Add text and timecode to timed_texts
                timed_texts.append(
                    (current_times, re.sub(r'\s$', '', current_text)))
                current_times, current_text = None, ""
            else:
                ## Extend texts
                current_text = current_text + re.sub(r'\r?\n', ' ', line)

    # A file that does not end with a blank line leaves the last cue
    # pending; flush it instead of silently dropping it.
    if current_times is not None:
        timed_texts.append(
            (current_times, re.sub(r'\s$', '', current_text)))

    return timed_texts


def _align_words(text, start, duration, tagged_words):
    """Estimate per-word timings by spreading *duration* evenly over *text*.

    Every character of *text* is given the same share of *duration*; a
    word's start is derived from its character position in *text*.
    *tagged_words* is an iterable of sequences whose first two items are
    the word and its tag. *text* must not be empty.
    """
    char_duration = duration / len(text)
    words = []
    cursor = 0  # position in ``text`` just after the last word we located

    for token in tagged_words:
        word, tag = token[0], token[1]
        position = text.find(word, cursor)
        if position == -1:
            # The tokenizer altered the word, so it is not literally in
            # the text. Fall back to the current position rather than
            # letting -1 produce a negative timestamp.
            position = cursor
        else:
            # Move past this occurrence so a repeated word is matched at
            # its next occurrence, not at the same place again.
            cursor = position + len(word)

        word_start = position * char_duration
        word_duration = len(word) * char_duration
        word_end = word_start + word_duration
        words.append({
            'word': word,
            'start': start + word_start,
            'end': start + word_end,
            'duration': word_duration,
            'tag': tag,
        })

    return words


# Read all cues up front so the subtitle file is closed before any
# database work starts.
with open(srtfile) as subtitle_file:
    cues = parse_vtt(subtitle_file.readlines())

for row in cues:
    times, text = row[0], row[1]

    # A cue without text has no words to align, and would divide by zero
    # when computing the per-character duration.
    if not text:
        continue

    sentence = {
        'filename': videofile,
        'text': text,
        'start': times[0],
        'end': times[1],
        'duration': times[1] - times[0],
        'words': []
    }

    # NOTE(review): pattern documents this keyword as lowercase
    # ``tokenize``; the original spelling is kept unchanged - confirm.
    parsed_sentence = parse(sentence['text'], Tokenize=True, chunks=False)

    # split() yields one token list per detected sentence; use all of them
    # so that words after the first sentence of a cue are not dropped.
    tagged_words = [
        token
        for tagged_sentence in parsed_sentence.split()
        for token in tagged_sentence
    ]

    sentence['words'] = _align_words(
        sentence['text'], sentence['start'], sentence['duration'],
        tagged_words)

    # NOTE(review): if ``db`` is a PyMongo database, ``insert`` is
    # deprecated in favour of ``insert_one`` - confirm before upgrading.
    db.sentences.insert(sentence)