Search when iconv fails

Continuing the discussion from Is this correct code?:

This discussion caught my interest and because it has been a while since I worked with iconv() I gave it another go.

Instead of using a database I created arrays to use as the strings source.

I created $usa_cities_arr from https://simple.wikipedia.org/wiki/List_of_United_States_cities_by_population (272 pairs) and also

$spain_cities_arr = array(
			1 => array("Madrid", "Madrid")
			,2 => array("Barcelona", "Catalonia")
			,3 => array("Valencia", "Valencia")
			,4 => array("Seville", "Andalusia")
			,5 => array("Zaragoza", "Aragon")
			,6 => array("Málaga", "Andalusia")
			,7 => array("Murcia", "Murcia")
			,8 => array("Palma de Mallorca", "Balearic Islands")
			,9 => array("Las Palmas de Gran Canaria", "Canary Islands")
			,10 => array("Bilbao", "Basque Country")
			);
					
$germany_cities_arr = array(
			1 => array("Berlin", "Berlin")
			,2 => array("Hamburg", "Hamburg")
			,3 => array("Munich", "Bavaria")
			,4 => array("Cologne", "North Rhine-Westphalia")
			,5 => array("Frankfurt", "Hesse")
			,6 => array("Essen", "North Rhine-Westphalia")
			,7 => array("Dortmund", "North Rhine-Westphalia")
			,8 => array("Stuttgart", "Baden-Württemberg")
			,9 => array("Düsseldorf", "North Rhine-Westphalia")
			,10 => array("Bremen", "Bremen")
			,11 => array("Hanover", "Lower Saxony")
			,12 => array("Duisburg", "North Rhine-Westphalia")
			,13 => array("Nuremberg", "Bavaria")
			,14 => array("Leipzig", "Saxony")
			,15 => array("Dresden", "Saxony")
			,16 => array("Bochum", "North Rhine-Westphalia")
			,17 => array("Wuppertal", "North Rhine-Westphalia")
			,18 => array("Bielefeld", "North Rhine-Westphalia")
			,19 => array("Bonn", "North Rhine-Westphalia")
			,20 => array("Mannheim", "Baden-Württemberg")
			,21 => array("Karlsruhe", "Baden-Württemberg")
			,22 => array("Gelsenkirchen", "North Rhine-Westphalia")
			,23 => array("Wiesbaden", "Hesse")
			,24 => array("Münster", "North Rhine-Westphalia")
			,25 => array("Mönchengladbach", "North Rhine-Westphalia")
			,26 => array("Chemnitz", "Saxony")
			,27 => array("Augsburg", "Bavaria")
			,28 => array("Braunschweig", "Lower Saxony")
			,29 => array("Aachen", "North Rhine-Westphalia")
			,30 => array("Krefeld", "North Rhine-Westphalia")
			,31 => array("Halle", "Saxony-Anhalt")
			,32 => array("Kiel", "Schleswig-Holstein")
			,33 => array("Magdeburg", "Saxony-Anhalt")
			,34 => array("Oberhausen", "North Rhine-Westphalia")
			,35 => array("Lübeck", "Schleswig-Holstein")
			,36 => array("Freiburg", "Baden-Württemberg")
			,37 => array("Hagen", "North Rhine-Westphalia")
			,38 => array("Erfurt", "Thuringia")
			,39 => array("Kassel", "Hesse")
			,40 => array("Rostock", "Mecklenburg-Vorpommern")
			,41 => array("Mainz", "Rhineland-Palatinate")
			,42 => array("Hamm", "North Rhine-Westphalia")
			,43 => array("Saarbrücken", "Saarland")
			,44 => array("Herne", "North Rhine-Westphalia")
			,45 => array("Mülheim an der Ruhr", "North Rhine-Westphalia")
			,46 => array("Solingen", "North Rhine-Westphalia")
			,47 => array("Osnabrück", "Lower Saxony")
			,48 => array("Ludwigshafen am Rhein", "Rhineland-Palatinate")
			,49 => array("Leverkusen", "North Rhine-Westphalia")
			,50 => array("Oldenburg", "Lower Saxony")
			);

then merged them

$all_cities_arr = array_merge($usa_cities_arr, $spain_cities_arr, $germany_cities_arr);	

One problem was that passing words to iconv() errorred because it saw strings that contain mostly ASCII as ASCII even if one character was UTF-8
Using String functions on the words caused problems because some UTF-8 characters were 2 bytes and the String function split the one character into two
Realizing this made me think of the mb_ (multibyte) functions

After much trial and error I came up with

function check_levenshtein_similar($haystack, $needle) {
  $needle_length = strlen($needle);
  $haystack_length = strlen($haystack);
  if ($needle_length >= $haystack_length) {
    $levenshtein_distance = levenshtein($needle, $haystack);
    similar_text($needle, $haystack, $similar_text_percent);
	if ( ($levenshtein_distance <= 2) || ( floor($similar_text_percent) >= 50) ) {
	  return true; 
	}
  } else {
    $offset = 0;
	while ($needle_length <= $haystack_length) {
      $haystack_substr = substr($haystack, $offset, $needle_length); 
      $levenshtein_distance = levenshtein($needle, $haystack_substr);
      similar_text($needle, $haystack_substr, $similar_text_percent);
	  if ( ($levenshtein_distance <= 2) || ( floor($similar_text_percent) >= 50) ) {
	    return true; 
	  } else {
	    $offset += 1; 
	    $haystack_length -= 1;
	  }
	}
  }
  return false;
}

function mbStringToArray ($string) {
    $strlen = mb_strlen($string, "ASCII");
    while ($strlen) {
        $array[] = mb_substr($string,0,1,"ASCII");
        $string = mb_substr($string,1,$strlen,"ASCII");
        $strlen = mb_strlen($string, "ASCII");
    }
    return $array;
} 			
				
function find_city($haystack, $needle) {
  foreach ($haystack as $city_state_arr) {
    $split_city_str = mbStringToArray($city_state_arr[0]); 
    $split_state_str = mbStringToArray($city_state_arr[1]);
    $converted_city_str = "";
    $converted_state_str = ""; 
	
    foreach ($split_city_str as $city_letter) {
      if ( !mb_check_encoding($city_letter, "ASCII") ) {
        $city_letter = mb_convert_encoding($city_letter, "ASCII", "UTF-8");
        $converted_city_str .= $city_letter;
      } else {
        $converted_city_str .= $city_letter;
      }  
    }

    foreach ($split_state_str as $state_letter) {
      if ( !mb_check_encoding($state_letter, "ASCII") ) {
        $state_letter = mb_convert_encoding($state_letter, "ASCII", "UTF-8");
        $converted_state_str .= $state_letter;
      } else {
        $converted_state_str .= $state_letter;
      }  
    }
  
    $city_state_arr[0] = mb_convert_encoding($city_state_arr[0], "UTF-8", "ASCII");
    $city_state_arr[1] = mb_convert_encoding($city_state_arr[1], "UTF-8", "ASCII");
	if ( ( stripos($converted_city_str, $needle) !== false )
		|| ( stripos($converted_state_str, $needle) !== false ) 
		|| ( check_levenshtein_similar($converted_city_str, $needle) ) 
		|| ( check_levenshtein_similar($converted_state_str, $needle) ) ) {
	  echo $city_state_arr[0] . " " . $city_state_arr[1] . "<br />";
	}
  }
}  

$needle is an unsafe string from a form. So it would need to be processed for use in the wild.
And it feels hackish. Some things could be better named, the code could be more elegant, and the levenshtein() and similar_text() values need tweaking to reduce false positives when $needle is a short string.

I can’t help thinking that mapping 2 byte UTF-8 characters to ASCII replacements would be easier, but I wanted to tackle it without for the learning experience.

I know @John_Betong has been working with metaphone() so I figured I’d try it.

I like it a lot! By putting it first in the function it cuts down on the expense incurred by levenshtein() and similar_text() in many cases.

function check_levenshtein_similar($haystack, $needle) {
  $meta_haystack = metaphone($haystack);
  $meta_needle = metaphone($needle);
  if ($meta_haystack === $meta_needle) {
    return true;
  }
...

I also put in some “sanity” checks before calling the function

if ( isset($_POST['search_string']) 
	&& ( trim($_POST['search_string']) != "")
	&& ( strlen(trim($_POST['search_string'])) > 2 ) ) {
  find_city( $all_cities_arr, trim($_POST['search_string']) );
}

I have yet to experiment with the optional levenshtein() cost parameters or the optional similar_text() phonemes parameter to see if using them would help the script’s run time, but hope to ASAP

Hi @Mittineague

I have made changes to the previous search routine and it now includes the following:

  1. Exact Match

  2. Single Partial Match using Php metaphone(…)

  3. Double Partial Match Separated by a Space

Demo with Super Partial Search

Follow the suggestions on the left.

Please email for source code and data.

Hi @Mittineague

I updated the above link and the Partial Metaphone Search works a lot better.

I was on a roll so decided to introduce Ajax and now there is a Live Metaphone Search

The latter is a bit rough and ready behind the scenes and hope to debug tomorrow.

After being side-tracked and delayed by a frustrating character encoding problem Can't Match Some (Foreign) Films After Table Convert to UTF8 (General or Unicode) - #19 by Mittineague I decided to put my arrays into a database.

For fun I used mysqli_ procedural, mysqli_ OOP, and PDO

Despite my being a relative newbie at Normalization, for example purposes I’ll digress and post them.

First up, mysqli_ procedural

$db_connection = mysqli_connect(HOST, USERNAME, PASSWORD, DATABASE);
if (mysqli_connect_errno()) {
    printf("Connect failed: %s\n", mysqli_connect_error());
    exit();
}
$create_countries_table_query = "CREATE TABLE IF NOT EXISTS `" . $countries_table_name
				. "` (
				`id` TINYINT UNSIGNED NOT NULL
				, `name` VARCHAR(100) NOT NULL
				, PRIMARY KEY (`id`)
				);";
$create_countries_table_result = mysqli_query($db_connection, $create_countries_table_query);
if (!$create_countries_table_result) {
  echo "create_countries_table_result FAIL " . mysqli_error($db_connection) . "<br/><br/>";
}  

$create_states_table_query = "CREATE TABLE IF NOT EXISTS `" . $states_table_name
				. "` (
				`name` VARCHAR(100) NOT NULL
				, `country_id` TINYINT UNSIGNED NOT NULL
				, `phoneme` VARCHAR(100) NOT NULL
				, PRIMARY KEY (`name`)
				);";
$create_states_table_result = mysqli_query($db_connection, $create_states_table_query);
if (!$create_states_table_result) {
  echo "create_states_table_result FAIL " . mysqli_error($db_connection) . "<br/><br/>";
}  

$create_cities_table_query = "CREATE TABLE IF NOT EXISTS `" . $cities_table_name
				. "` (
				`name` VARCHAR(100) NOT NULL
				, `state_name` VARCHAR(100) NOT NULL
				, `phoneme` VARCHAR(100) NOT NULL
				, PRIMARY KEY (`name`, `state_name`)
				);";
$create_cities_table_result = mysqli_query($db_connection, $create_cities_table_query);
if (!$create_cities_table_result) {
  echo "create_cities_table_result FAIL " . mysqli_error($db_connection) . "<br/><br/>";
}  
mysqli_close($db_connection);

Of course these are simple CREATE TABLE queries, but notice that
mysqli_connect() differs from mysql_connect() in that it takes the database name as a parameter and does not need an equivalent mysql_select_db() function to create the resource.
mysqli_query() differs from mysql_query() in that the resource and query parameters are reversed

I may need to run some ALTER TABLE queries at some point, but this has been good enough for a start for my purposes.

I like this thread. I don’t necessarily understand it in too great a level of detail, but it looks like it’s coming up with something worthwhile. Anyway, carry on…

1 Like

To do the INSERTs I used mysqli_ OOP

$all_countries = array('Germany', 'Italy', 'Norway', 'Scotland', 'Spain', 'Sweden', 'USA', 'Vietnam', 'Yemen');

require "src-data/germany-cities.php"; // $germany_cities_arr		0
require "src-data/italy-cities.php"; // $italy_cities_arr		1
require "src-data/norway-cities.php"; // $norway_cities_arr		2
require "src-data/scotland-cities.php"; // $scotland_cities_arr		3
require "src-data/spain-cities.php"; // $spain_cities_arr		4
require "src-data/sweden-cities.php"; // $sweden_cities_arr		5
require "src-data/usa-cities.php"; // $usa_cities_arr			6
require "src-data/vietnam-cities.php"; // $vietnam_cities_arr		7
require "src-data/yemen-cities.php"; // $yemen_cities_arr		8
$all_arrays = array($germany_cities_arr, $italy_cities_arr, $norway_cities_arr
		, $scotland_cities_arr, $spain_cities_arr, $sweden_cities_arr, $usa_cities_arr);
$mysqli = new mysqli(HOST, USERNAME, PASSWORD, DATABASE);
if ($mysqli->connect_error) {
  die('Connect Error (' . $mysqli->connect_errno . ') ' . $mysqli->connect_error);
}
$mysqli->set_charset("utf8");
 
$insert_countries_query = "INSERT INTO `countries` (`id`, `name`) VALUES (?, ?)";
$insert_countries_stmt = $mysqli->prepare($insert_countries_query);
foreach ($all_countries as $arr_key => $country_name) {
  $insert_countries_stmt->bind_param("is", $arr_key, $country_name);
  $insert_countries_stmt->execute();
  echo $mysqli->error . " .<br />";
}
$insert_countries_stmt->close();
echo $mysqli->error . " .<br />";

try {
  $insert_states_query = "INSERT INTO `states` (`name`, `country_id`, `phoneme`) VALUES (?, ?, ?)";
  $insert_states_stmt = $mysqli->prepare($insert_states_query);
  $insert_cities_query = "INSERT INTO `cities` (`name`, `state_name`, `phoneme`) VALUES (?, ?, ?)";
  $insert_cities_stmt = $mysqli->prepare($insert_cities_query);
}
catch (Exception $ex) {
  echo "Q-S Exception " . $ex->getMessage() . "<br/>"; 
}  
catch (Error $er) {
  echo "Q-S Error " . $er->getMessage() . "<br/>"; 
}

foreach ($all_arrays as $all_arr_key => $country_arr) {
  foreach ($country_arr as $city_state_vals) {
    try {
//	  $state_name = mb_convert_encoding($city_state_vals[1], "UTF-8", "ASCII");
//	  $city_name = mb_convert_encoding($city_state_vals[0], "UTF-8", "ASCII");
	  $state_name = $city_state_vals[1];
	  $city_name = $city_state_vals[0];
      $state_metaphone = metaphone($state_name);
      $city_metaphone = metaphone($city_name);
	  
      $insert_states_stmt->bind_param("sis", $state_name, $all_arr_key, $state_metaphone );
      $insert_states_stmt->execute();
    }
	catch (Exception $ex) {
      echo "State Exception " . $ex->getMessage() . "<br/>"; 
	}  
	catch (Error $er) {
      echo "State Error " . $er->getMessage() . "<br/>"; 
    }
    try {
      $insert_cities_stmt->bind_param("sss", $city_name, $state_name, $city_metaphone );
      $insert_cities_stmt->execute();
    }
	catch (Exception $ex) {
      echo "City Exception " . $ex->getMessage() . "<br/>"; 
    }
	catch (Error $er) {
      echo "City Error " . $er->getMessage() . "<br/>"; 
    }
  }
}
try {
  $insert_states_stmt->close();
  echo $mysqli->error . " ... ";
  $insert_cities_stmt->close();
  echo $mysqli->error . " ... ";
} 
catch (Exception $ex) {
  echo "Close Exception " . $ex->getMessage() . "<br/>"; 
}
catch (Error $er) {
  echo "Close Error " . $er->getMessage() . "<br/>"; 
}
$mysqli->close();

Noticeable differences here are
Because the mysqli object contains the connection as a property, there is no need to pass it in function calls.
The mysqli(object)->method(function) is different syntax, but not all that difficult to get used to once you’ve used it for a while.
A big improvement is the use of “?” placeholders and prepared statements.

Since I’m the only one using the code I hope I can trust myself to not try anything malicious, :wink:
but bound parameters are a good, if not critical, practice to get in the habit of using for any and all user supplied input.

1 Like

Eventually:
Ajax Dynamic Search Demo - source code supplied.

For the SELECT query I used PDO
You can see that this was a “first effort” - with messy naming, commented out lines etc. while I was working up the code in attempt to get what I wanted.
But before I get back on-topic here it is anyway.

  $pdo_mysql = new PDO('mysql:host=' . HOST . ';dbname=' . DATABASE , USERNAME, PASSWORD);
  $pdo_mysql->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);
  $pdo_mysql->setAttribute(PDO::ATTR_DEFAULT_FETCH_MODE, PDO::FETCH_ASSOC); 
//  $pdo_stmt = $pdo_mysql->prepare("SELECT `id`, `name` FROM `countries`");
//  $pdo_stmt = $pdo_mysql->prepare("SELECT `name`, `country_id`, `phoneme` FROM `states`");
//  $pdo_stmt = $pdo_mysql->prepare("SELECT `name`, `state_name`, `phoneme` FROM `cities`");
  $pdo_stmt = $pdo_mysql->prepare("SELECT `countries`.`name` AS `country_name`
					, `states`.`name` AS `state_name`
					, `cities`.`name` AS `city_name`
					FROM `countries` 
					INNER JOIN `states` 
					ON `countries`.`id` = `states`.`country_id`
					INNER JOIN `cities` 
					ON `states`.`name` = `cities`.`state_name`
					WHERE 
					   `states`.`name` LIKE :test_name 
					OR `cities`.`name` LIKE :test_name 
					OR `states`.`phoneme` LIKE :test_phoneme 
					OR `cities`.`phoneme` LIKE :test_phoneme 
					OR `countries`.`id` = :test_country_id
					ORDER BY `countries`.`name`, `cities`.`name`
					");

$split_test_str = str_split("California");
$test_n_str = "%" . implode("%", $split_test_str) . "%";
//$test_p_str = metaphone("Forlì"); // Italy		
//$test_p_str = metaphone("Málaga"); // Spain	
$test_p_str = metaphone("Münster"); // Germany
//$test_p_str = '%' . metaphone("monzter") . '%'; // misspelled lowercase Germany
$test_c_int = 6;

$pdo_stmt->bindParam(':test_name', $test_n_str, PDO::PARAM_STR);
$pdo_stmt->bindParam(':test_phoneme', $test_p_str, PDO::PARAM_STR);
$pdo_stmt->bindParam(':test_country_id', $test_c_int, PDO::PARAM_INT);
  
  if ($pdo_stmt->execute()) {
    while ($row = $pdo_stmt->fetch()) {
      echo "<pre>";
      print_r($row);
	  echo "</pre>";
    }
  }

PDO is a new beast compared to mysqli_
Unlike mysqli_ that has both procedural and OOP, PDO is only OOP
A key feature to take note of that in addition to having “?” placeholders, PDO also has named placeholders.

To get back to the OP topic of “iconv”, you can see that similar to John’s use of the “%” wildcard, I tried an exaggerated use not only changing “needle” to “%needle%” but to “%n%e%e%d%l%e%” in an attempt to get around the “single 2 byte character” problem.
It works, BUT it works too well returning way to many positives.
Having the wildcard at only the beginning and end was better in that a lot of “wrong” results were not returned. But the “single 2 byte character” problem remained.

mysqli_ is a few days older than PDO - they were both first introduced in July 2004 - mysqli_ first and PDO a few days later.

lol I meant “new” as in different, not age. Regardless, I hadn’t realized that mysqli_ and PDO have been around for that long. Where does the time go!

Back to the iconv problem.
To restate the situation.
The database has the names of “cities”, “states” and "countries"1 (cities may be towns, states may be provinces, territories, etc. but are considered cities and states to keep it simple).
A form is used to enter names to be used to search the database for matches.

One approach could be to retrieve all names from the database and use PHP code to put together a result set.
This would be inefficient and would not be a good idea for larger databases.
A compromise could be to restrict users so that only a limited number of database results were used. eg. search for “this entered text” in this country. But putting such restrictions upon users is not always ideal.

What the database contains can be controlled, What users enter into text inputs, not so much.
But it can be manipulated (and indeed should be) upon form submission before it is used in a query.

It is easy enough to remove extra whitespace.
Compensating for misspelled or partial input entries, not so easy.

For example, if the city is “San Francisco” and the user enters " San francisco", no problem.
But if the city is “Đà Nẵng” and the user enters “Da Nang”, what then?

The main objective is to find the best possible balance between true / false - positives / negatives in the result set.

A first attempt was to use the iconv function. It had been hoped that by using “//TRANSLIT” that “Đà Nẵng” could be found as “Da Nang” and returned as a match.
Unfortunately, the best was either “?? N?ng” using //TRANSLIT or, using “//IGNORE” which resulted in " N ng".

This meant that a search for “Da Nang” would result in nothing being returned as a match even though the city did exist in the database.
Hence the reason for starting this topic, poring through PHP documention for possible alternative approaches, and quite a bit of trial and error testing.

Why was iconv being so troublesome?
Some characters are 7-bit, some 8-bit, others two bytes or more.
iconv did OK with converting some characters, but not with others.

Using mb_convert_encoding had better results than attempts to use iconv had produced.
It converted “Forlì” to “Forli” and “Málaga” to “Malaga”
But it failed with “Đà Nẵng” and “Muḩāfaz̧at al Ḩudaydah”

It became clear that I was trying to push iconv and mb_convert_encoding to do more than they were intended to do when “hack” kludges began to creep into the code.

One common solution is to have a “replacement” array of characters. This may work for smaller numbers of characters, but I for one would prefer to not do the tedious work involved with manually mapping out replacements for a large number of possible characters.

The Transliterator class to the rescue.

It took a while to determine what “from” and “to” to use of the 294 possible, (in this case, for my somewhat limited 575 rows it was “Latin-ASCII”) but once found, success !

Then a couple of ALTER TABLE queries to add “transliteration” fields and an INSERT INTO query and on to the next problem on the list, dealing with misspelled and partial input entries.

<?php
declare(strict_types=1);
error_reporting(E_ALL);
ini_set('display_errors', 'true');

$transliterator_id_select = "";
$input_text = "";
$output_text = "";

function transliterate_text(string $transliterator_id, string $text_string): string {
  $transliterated_text = transliterator_transliterate($transliterator_id, $text_string);
  return $transliterated_text;
}

$transliterator_list_ids = transliterator_list_ids();
sort($transliterator_list_ids);
if ($_POST) {
  if ($_POST['transliterator_id_select']) {
    $transliterator_id_select = $_POST['transliterator_id_select'];
  }
  if ($_POST['input_text']) {
    $input_text = $_POST['input_text'];
  }
  $output_text = transliterate_text($transliterator_id_select, $input_text);
}
?>
<!DOCTYPE HTML>
<html lang="en">
<head>
<title>Intl Transliteration</title>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<style type="text/css">
#display_div {
  border: 3px double #444;
  padding: 0.5em 1em;
  overflow: auto;
}
</style>
<script type="text/javascript">
// script needed before the DOM is loaded here
</script>
</head>
<body>
<h1>Intl Transliteration</h1>
<form action="#" method="post">
  <fieldset>
    <legend>Intl Transliteration</legend>
	
	<label for="transliterator_id_select">Start-End</label>
	<select name="transliterator_id_select" id="transliterator_id_select">
<?php 
foreach ($transliterator_list_ids as $transliterator_list_id) {
  if ($transliterator_id_select === $transliterator_list_id) {
    echo '<option value="' . $transliterator_list_id   . '" selected="selected">' . $transliterator_list_id . '</option>';
  } else if ( (!$_POST) && ($transliterator_list_id == "Latin-ASCII") ) {
    echo '<option value="' . $transliterator_list_id   . '" selected="selected">' . $transliterator_list_id . '</option>';
  } else {
    echo '<option value="' . $transliterator_list_id   . '">' . $transliterator_list_id . '</option>';
  }
}
?>
    </select>

	<textarea name="input_text" rows="10" cols="50"><?php  echo $input_text; ?></textarea>

	<input type="submit" value="Transliterate" />
	
  </fieldset>
</form>
  <div><?php  echo $transliterator_id_select; ?></div>
  <div id="display_div" rows="10" cols="50"><pre><?php  echo $output_text; ?></pre></div>
<script type="text/javascript">
// script that needs the DOM to be loaded here
</script>
</body>
</html>

This topic was automatically closed 91 days after the last reply. New replies are no longer allowed.