<?php

    // Test script: builds clean titles from search results for display in the page's meta description and content description.
    // Reads the dirty titles from "sample-dirty-titles.txt".

    // read input dirty file and store in array

    $fmeta_a = file("sample-dirty-titles.txt", FILE_IGNORE_NEW_LINES);
    $lcount = count($fmeta_a);
    $output_a = null;

    //send back piped string of all titles
    $output_piped = '';

    // for each title
    for ($i = 0; $i < $lcount; $i++) {

        // any protected words which need to be removed form title text...(* use lowercase keywords)
        $strip_words_a = ['superteacherworksheets', 'super teacher worksheets', 'super teacher', 'superteacher', 'scholastic', 'copyright', '@', 'ltd', 'https', 'http', 'www'];

        // title string
        $string = $fmeta_a[$i];
        // remove more than one space between words
        $string = preg_replace('/\s\s+/', ' ', $string);

        // remove domain urls using regex
        $string = preg_replace('/\b((https?|ftp|file):\/\/|www\.)[-A-Z0-9+&@#\/%?=~_|$!:,.;]*[A-Z0-9+&@#\/%=~_|$]/i', ' ', $string);

        //remove any domains without protocol by splitting title into word array and checking string with . (dot) in the middle and removing them.
        $string_a = explode(' ', $string);
        $ecount = count($string_a);
        $string = '';

        for($e = 0; $e < $ecount; $e++) {

            $string_e = $string_a[$e];
            // if word contains .(dot), remove it
            if(strpos($string_e, '.') == false) {
                // remove special chars other than alphabets, numbers and dash
                $string_e = preg_replace('/[^A-Za-z0-9\-]/', '', $string_e); // Removes special chars.
                //fix any words with same letter repeating more than 2 times
                $string_e = preg_replace("/(.)\\1{2,}/", "$1", $string_e);
                //rebuild the title string
                if(trim($string_e)) {
                    $string .= ' ' . $string_e;
                }
            }
        }
        //echo $string . ' <br>';
        $string = strip_tags($string);
        $string = str_replace(' ', '-', $string); // Replaces all spaces with hyphens.

        // replace dash with space
        $string = preg_replace('/-+/', ' ', $string);

        // remove any incriminating keywords
        $string = strtolower($string); // convert to lower case
        $cnt_strip = count($strip_words_a);

        for($s = 0; $s < $cnt_strip; $s++) {
            $string = str_replace($strip_words_a[$s], ' ', $string);
        }
        // convert to capital letter first words
        $string = ucwords($string);
        // remove more than one space between words
        $string = preg_replace('/\s\s+/', ' ', $string);
        $string = trim($string);

        // build piped output
        if($string) {
            $output_piped .= $string . '|';
        }

        echo $fmeta_a[$i] . '<br>';
        echo $string . '<br><br>';

    }
    // remove last pipe
    $output_piped = rtrim($output_piped, '|');
    echo $output_piped;